From b09ecfb8fe43c34abe2df4ad9a5347dec3702a95 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 00:27:12 +0000 Subject: [PATCH 01/49] feat: add destination readback introspection for smoke tests - Add destination_readback.py module with stats-level readback: table row counts, column names/types, per-column null/non-null stats - Map 5 destinations to cache implementations (BigQuery, DuckDB, MotherDuck, Postgres, Snowflake) for ~95% user coverage - Deterministic table name resolution per destination - Integrate readback into run_destination_smoke_test() (on by default) - Fix paired_destination_name bugs in Snowflake, Postgres, MotherDuck - Update CLI and MCP tool docstrings to reflect readback capability Co-Authored-By: AJ Steers --- airbyte/_util/destination_readback.py | 537 +++++++++++++++++++++++ airbyte/_util/destination_smoke_tests.py | 67 ++- airbyte/caches/motherduck.py | 2 +- airbyte/caches/postgres.py | 2 +- airbyte/caches/snowflake.py | 2 +- airbyte/cli/pyab.py | 8 +- airbyte/mcp/local.py | 12 +- 7 files changed, 615 insertions(+), 15 deletions(-) create mode 100644 airbyte/_util/destination_readback.py diff --git a/airbyte/_util/destination_readback.py b/airbyte/_util/destination_readback.py new file mode 100644 index 000000000..87d7f80d9 --- /dev/null +++ b/airbyte/_util/destination_readback.py @@ -0,0 +1,537 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Destination readback introspection for smoke tests. + +This module provides the ability to read back data written by a destination +connector and produce stats-level reports: table row counts, column names +and types, and per-column null/non-null counts. + +The readback leverages PyAirbyte's existing cache implementations to query +the same backends that destinations write to. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +from pydantic import BaseModel + +from airbyte._util.name_normalizers import LowerCaseNormalizer + + +if TYPE_CHECKING: + from collections.abc import Callable + + from airbyte.caches.base import CacheBase + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Destination-to-cache mapping +# --------------------------------------------------------------------------- + +# Maps destination connector names to cache class import paths. +# We use strings to avoid importing all cache classes at module load time. +_DESTINATION_TO_CACHE_INFO: dict[str, dict[str, str]] = { + "destination-bigquery": { + "module": "airbyte.caches.bigquery", + "class": "BigQueryCache", + }, + "destination-duckdb": { + "module": "airbyte.caches.duckdb", + "class": "DuckDBCache", + }, + "destination-motherduck": { + "module": "airbyte.caches.motherduck", + "class": "MotherDuckCache", + }, + "destination-postgres": { + "module": "airbyte.caches.postgres", + "class": "PostgresCache", + }, + "destination-snowflake": { + "module": "airbyte.caches.snowflake", + "class": "SnowflakeCache", + }, +} + +SUPPORTED_DESTINATIONS: frozenset[str] = frozenset(_DESTINATION_TO_CACHE_INFO.keys()) +"""Destination connector names that support readback introspection.""" + + +def _get_readback_supported(destination_name: str) -> bool: + """Return True if readback is supported for the given destination.""" + return destination_name in SUPPORTED_DESTINATIONS + + +# --------------------------------------------------------------------------- +# Deterministic table name resolution +# --------------------------------------------------------------------------- + +# Destinations normalize stream names into SQL table names. The logic +# differs per destination. We hard-code the known conventions here so +# that we can *optimistically* compute the expected table name without +# scanning the schema. + + +def _normalize_table_name_default(stream_name: str) -> str: + """Default normalizer: lowercase + replace non-alphanumeric with underscores. + + This matches the LowerCaseNormalizer used by most PyAirbyte caches and + aligns with how the Airbyte Java/Python destinations normalize names. + """ + return LowerCaseNormalizer.normalize(stream_name) + + +def _normalize_table_name_snowflake(stream_name: str) -> str: + """Snowflake normalizer: same as default (lowercase). + + Snowflake destinations use quoted identifiers in lowercase, + matching the LowerCaseNormalizer behavior. + """ + return LowerCaseNormalizer.normalize(stream_name) + + +def _normalize_table_name_bigquery(stream_name: str) -> str: + """BigQuery normalizer: same as default (lowercase). + + BigQuery destinations use backtick-quoted identifiers in lowercase. + """ + return LowerCaseNormalizer.normalize(stream_name) + + +_DESTINATION_TABLE_NORMALIZERS: dict[str, Callable[[str], str]] = { + "destination-bigquery": _normalize_table_name_bigquery, + "destination-duckdb": _normalize_table_name_default, + "destination-motherduck": _normalize_table_name_default, + "destination-postgres": _normalize_table_name_default, + "destination-snowflake": _normalize_table_name_snowflake, +} + + +def _get_table_normalizer( + destination_name: str, +) -> Callable[[str], str]: + """Return the table name normalizer for the given destination.""" + return _DESTINATION_TABLE_NORMALIZERS.get( + destination_name, + _normalize_table_name_default, + ) + + +def _resolve_expected_table_name( + destination_name: str, + stream_name: str, +) -> str: + """Deterministically resolve the expected SQL table name for a stream. + + This uses the destination's known naming conventions to predict + what the table name should be in the backend. + """ + normalizer = _get_table_normalizer(destination_name) + return normalizer(stream_name) + + +# --------------------------------------------------------------------------- +# Column name normalization +# --------------------------------------------------------------------------- + + +def _normalize_column_name_default(column_name: str) -> str: + """Default column normalizer: lowercase + replace non-alphanumeric with underscores.""" + return LowerCaseNormalizer.normalize(column_name) + + +def _resolve_expected_column_name( + destination_name: str, + column_name: str, +) -> str: + """Deterministically resolve the expected SQL column name. + + For now all destinations use the same LowerCaseNormalizer for columns. + """ + _ = destination_name # Reserved for per-destination overrides + return _normalize_column_name_default(column_name) + + +# --------------------------------------------------------------------------- +# Readback result models +# --------------------------------------------------------------------------- + + +class ColumnStats(BaseModel): + """Null/non-null statistics for a single column.""" + + column_name: str + """The column name as found in the destination.""" + + null_count: int + """Number of NULL values in this column.""" + + non_null_count: int + """Number of non-NULL values in this column.""" + + total_count: int + """Total row count (null_count + non_null_count).""" + + +class ColumnInfo(BaseModel): + """Column name and type information.""" + + column_name: str + """The column name as found in the destination.""" + + column_type: str + """The SQL data type name as reported by the database.""" + + +class TableInfo(BaseModel): + """Basic table info: name and row count.""" + + table_name: str + """The table name as found in the destination.""" + + row_count: int + """Number of rows in the table.""" + + expected_stream_name: str + """The original stream name that this table corresponds to.""" + + +class TableReadbackReport(BaseModel): + """Full readback report for a single table.""" + + table_name: str + """The table name as found in the destination.""" + + expected_stream_name: str + """The original stream name.""" + + row_count: int + """Number of rows found.""" + + columns: list[ColumnInfo] + """Column names and types.""" + + column_stats: list[ColumnStats] + """Per-column null/non-null statistics.""" + + +class DestinationReadbackResult(BaseModel): + """Result of reading back destination-written data. + + Contains three logical datasets: + 1. tables - list of tables with row counts + 2. columns - per-table column names and types + 3. column_stats - per-table, per-column null/non-null counts + """ + + destination: str + """The destination connector name.""" + + namespace: str + """The namespace (schema) that was inspected.""" + + readback_supported: bool + """Whether readback was supported for this destination.""" + + tables: list[TableInfo] + """Dataset 1: Tables found with row counts.""" + + table_reports: list[TableReadbackReport] + """Full per-table reports including columns and stats.""" + + tables_missing: list[str] + """Stream names for which the expected table was not found.""" + + error: str | None = None + """Error message if readback failed.""" + + def get_tables_summary(self) -> list[dict[str, Any]]: + """Return dataset 1: tables with row counts as plain dicts.""" + return [t.model_dump() for t in self.tables] + + def get_columns_summary(self) -> list[dict[str, Any]]: + """Return dataset 2: columns with types, grouped by table.""" + result = [] + for report in self.table_reports: + result.extend( + { + "table_name": report.table_name, + "column_name": col.column_name, + "column_type": col.column_type, + } + for col in report.columns + ) + return result + + def get_column_stats_summary(self) -> list[dict[str, Any]]: + """Return dataset 3: per-column null/non-null counts.""" + result = [] + for report in self.table_reports: + result.extend( + { + "table_name": report.table_name, + "column_name": stat.column_name, + "null_count": stat.null_count, + "non_null_count": stat.non_null_count, + "total_count": stat.total_count, + } + for stat in report.column_stats + ) + return result + + +# --------------------------------------------------------------------------- +# Cache construction from destination config +# --------------------------------------------------------------------------- + + +def _build_readback_cache( + destination_name: str, + destination_config: dict[str, Any], + namespace: str, +) -> CacheBase: + """Construct a cache instance that can query the destination's backend. + + The cache is configured to point at the same backend the destination + wrote to, using the supplied namespace as the schema. + + Raises: + NotImplementedError: If the destination is not supported. + """ + from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 + destination_to_cache, + ) + + if destination_name not in SUPPORTED_DESTINATIONS: + raise NotImplementedError( + f"Readback is not supported for '{destination_name}'. " + f"Supported destinations: {sorted(SUPPORTED_DESTINATIONS)}" + ) + + # The destination_to_cache function expects the config to have + # a 'destinationType' field. We ensure it's present. + config_with_type = dict(destination_config) + if "destinationType" not in config_with_type and "DESTINATION_TYPE" not in config_with_type: + # Infer the type from the destination name + dest_type = destination_name.replace("destination-", "") + config_with_type["destinationType"] = dest_type + + cache = destination_to_cache(config_with_type) + + # Override the schema to match the namespace used by the smoke test + if hasattr(cache, "schema_name"): + # Use model_copy to create a new instance with updated schema + cache = cache.model_copy(update={"schema_name": namespace}) + + return cache + + +# --------------------------------------------------------------------------- +# Core readback logic +# --------------------------------------------------------------------------- + + +def _query_table_row_count( + cache: CacheBase, + table_name: str, +) -> int | None: + """Query the row count for a table. Returns None if the table doesn't exist.""" + try: + result = cache.run_sql_query( + f"SELECT COUNT(*) AS row_count FROM {cache.schema_name}.{table_name}", + ) + if result: + return int(result[0]["row_count"]) + return 0 # noqa: TRY300 + except Exception: + logger.debug("Table %s.%s not found or not accessible.", cache.schema_name, table_name) + return None + + +def _query_column_info( + cache: CacheBase, + table_name: str, +) -> list[ColumnInfo]: + """Query column names and types for a table. + + Uses a SELECT with LIMIT 0 to get column metadata from the result set, + avoiding INFORMATION_SCHEMA scanning. + """ + try: + # We use the SQLAlchemy engine's inspector for column info + import sqlalchemy # noqa: PLC0415 + + engine = cache.get_sql_engine() + inspector = sqlalchemy.inspect(engine) + columns = inspector.get_columns(table_name, schema=cache.schema_name) + return [ + ColumnInfo( + column_name=col["name"], + column_type=str(col["type"]), + ) + for col in columns + ] + except Exception: + logger.debug( + "Could not get column info for %s.%s", + cache.schema_name, + table_name, + ) + return [] + + +def _query_column_stats( + cache: CacheBase, + table_name: str, + columns: list[ColumnInfo], +) -> list[ColumnStats]: + """Query per-column null/non-null counts.""" + if not columns: + return [] + + # Build a SQL query that computes COUNT(*), COUNT(col) for each column + # COUNT(*) gives total rows, COUNT(col) gives non-null count + count_exprs = [] + for col in columns: + col_name = col.column_name + # Quote column names to handle special characters and reserved words + quoted = f'"{col_name}"' + count_exprs.append(f"COUNT({quoted}) AS non_null_{col_name}") + + count_exprs_str = ", ".join(count_exprs) + sql = ( + f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " + f"FROM {cache.schema_name}.{table_name}" + ) + + try: + result = cache.run_sql_query(sql) + except Exception: + logger.debug( + "Could not query column stats for %s.%s", + cache.schema_name, + table_name, + ) + return [] + + if not result: + return [] + + row = result[0] + total_rows = int(row["total_rows"]) + + stats = [] + for col in columns: + non_null_key = f"non_null_{col.column_name}" + non_null_count = int(row.get(non_null_key, 0)) + stats.append( + ColumnStats( + column_name=col.column_name, + null_count=total_rows - non_null_count, + non_null_count=non_null_count, + total_count=total_rows, + ) + ) + + return stats + + +def run_destination_readback( + *, + destination_name: str, + destination_config: dict[str, Any], + namespace: str, + stream_names: list[str], +) -> DestinationReadbackResult: + """Read back data from a destination after a smoke test and produce stats. + + This is the main entry point for readback introspection. It: + 1. Constructs a cache that can query the destination's backend + 2. For each expected stream, resolves the expected table name + 3. Queries row counts, column info, and column stats + + Returns a ``DestinationReadbackResult`` with three datasets: + - tables: table names + row counts + - columns: column names + types per table + - column_stats: null/non-null counts per column per table + + If the destination is not supported for readback, returns a result + with ``readback_supported=False`` and empty data. + """ + if not _get_readback_supported(destination_name): + return DestinationReadbackResult( + destination=destination_name, + namespace=namespace, + readback_supported=False, + tables=[], + table_reports=[], + tables_missing=stream_names, + ) + + try: + cache = _build_readback_cache( + destination_name=destination_name, + destination_config=destination_config, + namespace=namespace, + ) + except Exception as ex: + logger.warning("Failed to build readback cache for %s: %s", destination_name, ex) + return DestinationReadbackResult( + destination=destination_name, + namespace=namespace, + readback_supported=True, + tables=[], + table_reports=[], + tables_missing=stream_names, + error=f"Failed to build readback cache: {ex}", + ) + + tables: list[TableInfo] = [] + table_reports: list[TableReadbackReport] = [] + tables_missing: list[str] = [] + + for stream_name in stream_names: + expected_table = _resolve_expected_table_name(destination_name, stream_name) + + # Optimistic: try to query the expected table directly + row_count = _query_table_row_count(cache, expected_table) + + if row_count is None: + tables_missing.append(stream_name) + continue + + tables.append( + TableInfo( + table_name=expected_table, + row_count=row_count, + expected_stream_name=stream_name, + ) + ) + + # Get column info + columns = _query_column_info(cache, expected_table) + + # Get column stats + column_stats = _query_column_stats(cache, expected_table, columns) + + table_reports.append( + TableReadbackReport( + table_name=expected_table, + expected_stream_name=stream_name, + row_count=row_count, + columns=columns, + column_stats=column_stats, + ) + ) + + return DestinationReadbackResult( + destination=destination_name, + namespace=namespace, + readback_supported=True, + tables=tables, + table_reports=table_reports, + tables_missing=tables_missing, + ) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 67c87adbb..bb65c3d77 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -7,11 +7,17 @@ Smoke tests send synthetic data from the built-in smoke test source to a destination connector and report whether the destination accepted the data -without errors. No readback or comparison is performed. +without errors. + +When the destination has a compatible cache implementation, readback +introspection is automatically performed to produce stats on the written +data: table row counts, column names/types, and per-column null/non-null +counts. """ from __future__ import annotations +import logging import time from datetime import datetime, timezone from pathlib import Path @@ -21,9 +27,16 @@ from pydantic import BaseModel from airbyte import get_source +from airbyte._util.destination_readback import ( + DestinationReadbackResult, + run_destination_readback, +) from airbyte.exceptions import PyAirbyteInputError +logger = logging.getLogger(__name__) + + NAMESPACE_PREFIX = "zz_deleteme" """Prefix for auto-generated smoke test namespaces. @@ -83,6 +96,17 @@ class DestinationSmokeTestResult(BaseModel): error: str | None = None """Error message if the smoke test failed.""" + readback_result: DestinationReadbackResult | None = None + """Readback introspection result, if supported for this destination. + + Contains three datasets: + 1. tables - table names with row counts + 2. columns - column names and types per table + 3. column_stats - per-column null/non-null counts + + None if the write itself failed or readback is not supported. + """ + def get_smoke_test_source( *, @@ -194,6 +218,12 @@ def _sanitize_error(ex: Exception) -> str: return f"{type(ex).__name__}: {ex}" +def _get_stream_names_from_source(source_obj: Source) -> list[str]: + """Extract stream names from a configured source.""" + catalog = source_obj.get_configured_catalog() + return [stream.stream.name for stream in catalog.streams] + + def run_destination_smoke_test( *, destination: Destination, @@ -208,9 +238,11 @@ def run_destination_smoke_test( Sends synthetic test data from the smoke test source to the specified destination and returns a structured result. - This function does NOT read back data from the destination or compare - results. It only verifies that the destination accepts the data without - errors. + When the destination has a compatible cache implementation, readback + introspection is automatically performed after a successful write. + The readback produces stats on the written data (table row counts, + column names/types, and per-column null/non-null counts) and is + included in the result as ``readback_result``. `destination` is a resolved `Destination` object ready for writing. @@ -244,6 +276,9 @@ def run_destination_smoke_test( custom_scenarios_file=custom_scenarios_file, ) + # Capture stream names for readback before the write consumes the source + stream_names = _get_stream_names_from_source(source_obj) + # Normalize scenarios to a display string if isinstance(scenarios, list): scenarios_str = ",".join(scenarios) if scenarios else "fast" @@ -267,6 +302,29 @@ def run_destination_smoke_test( elapsed = time.monotonic() - start_time + # Perform readback introspection if the write succeeded + readback_result: DestinationReadbackResult | None = None + if success: + try: + destination_config = destination.get_config() + readback_result = run_destination_readback( + destination_name=destination.name, + destination_config=destination_config, + namespace=namespace, + stream_names=stream_names, + ) + except NotImplementedError: + logger.info( + "Readback not supported for destination '%s'.", + destination.name, + ) + except Exception as ex: + logger.warning( + "Readback failed for destination '%s': %s", + destination.name, + ex, + ) + return DestinationSmokeTestResult( success=success, destination=destination.name, @@ -275,4 +333,5 @@ def run_destination_smoke_test( scenarios_requested=scenarios_str, elapsed_seconds=round(elapsed, 2), error=error_message, + readback_result=readback_result, ) diff --git a/airbyte/caches/motherduck.py b/airbyte/caches/motherduck.py index 3fcff8e31..7029724e8 100644 --- a/airbyte/caches/motherduck.py +++ b/airbyte/caches/motherduck.py @@ -73,7 +73,7 @@ class MotherDuckCache(MotherDuckConfig, DuckDBCache): _sql_processor_class: ClassVar[type[SqlProcessorBase]] = MotherDuckSqlProcessor - paired_destination_name: ClassVar[str | None] = "destination-bigquery" + paired_destination_name: ClassVar[str | None] = "destination-motherduck" paired_destination_config_class: ClassVar[type | None] = DestinationDuckdb @property diff --git a/airbyte/caches/postgres.py b/airbyte/caches/postgres.py index db66bbc7d..e153309be 100644 --- a/airbyte/caches/postgres.py +++ b/airbyte/caches/postgres.py @@ -42,7 +42,7 @@ class PostgresCache(PostgresConfig, CacheBase): _sql_processor_class: ClassVar[type[SqlProcessorBase]] = PostgresSqlProcessor - paired_destination_name: ClassVar[str | None] = "destination-bigquery" + paired_destination_name: ClassVar[str | None] = "destination-postgres" paired_destination_config_class: ClassVar[type | None] = DestinationPostgres @property diff --git a/airbyte/caches/snowflake.py b/airbyte/caches/snowflake.py index 2bf5485cb..82e59e513 100644 --- a/airbyte/caches/snowflake.py +++ b/airbyte/caches/snowflake.py @@ -78,7 +78,7 @@ class SnowflakeCache(SnowflakeConfig, CacheBase): _sql_processor_class: ClassVar[type[SqlProcessorBase]] = SnowflakeSqlProcessor - paired_destination_name: ClassVar[str | None] = "destination-bigquery" + paired_destination_name: ClassVar[str | None] = "destination-snowflake" paired_destination_config_class: ClassVar[type | None] = DestinationSnowflake @property diff --git a/airbyte/cli/pyab.py b/airbyte/cli/pyab.py index f71ca5b5a..f179ff307 100644 --- a/airbyte/cli/pyab.py +++ b/airbyte/cli/pyab.py @@ -729,9 +729,11 @@ def destination_smoke_test( failure patterns: type variations, null handling, naming edge cases, schema variations, and batch sizes. - This command does NOT read back data from the destination or compare - results. It only verifies that the destination accepts the data without - errors. + When the destination has a compatible cache implementation (DuckDB, + Postgres, Snowflake, BigQuery, MotherDuck), readback introspection + is automatically performed after a successful write. The readback + produces stats on the written data: table row counts, column + names/types, and per-column null/non-null counts. Usage examples: diff --git a/airbyte/mcp/local.py b/airbyte/mcp/local.py index 7df085365..9b7ec53e8 100644 --- a/airbyte/mcp/local.py +++ b/airbyte/mcp/local.py @@ -543,9 +543,7 @@ def get_stream_previews( ) source.set_config(config_dict) - streams_param: list[str] | Literal["*"] | None = resolve_list_of_strings( - streams - ) # pyrefly: ignore[no-matching-overload] + streams_param: list[str] | Literal["*"] | None = resolve_list_of_strings(streams) # pyrefly: ignore[no-matching-overload] if streams_param and len(streams_param) == 1 and streams_param[0] == "*": streams_param = "*" @@ -910,8 +908,12 @@ def destination_smoke_test( # noqa: PLR0913, PLR0917 type variations, null handling, naming edge cases, schema variations, and batch sizes. - This tool does NOT read back data from the destination or compare results. - It only verifies that the destination accepts the data without errors. + When the destination has a compatible cache implementation (DuckDB, + Postgres, Snowflake, BigQuery, MotherDuck), readback introspection is + automatically performed after a successful write. The readback produces + stats on the written data: table row counts, column names/types, and + per-column null/non-null counts. Results are included in the response + as ``readback_result``. """ # Resolve destination config config_dict = resolve_connector_config( From 8c0385b96e79f45a46f4b1c568608b51dd6f133a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 00:29:49 +0000 Subject: [PATCH 02/49] fix: address CI lint and type check failures - Fix pyrefly type error: explicitly type count_exprs as list[str] - Restore ruff-compliant formatting on mcp/local.py line 546 Co-Authored-By: AJ Steers --- airbyte/_util/destination_readback.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/airbyte/_util/destination_readback.py b/airbyte/_util/destination_readback.py index 87d7f80d9..97a26fc26 100644 --- a/airbyte/_util/destination_readback.py +++ b/airbyte/_util/destination_readback.py @@ -394,7 +394,7 @@ def _query_column_stats( # Build a SQL query that computes COUNT(*), COUNT(col) for each column # COUNT(*) gives total rows, COUNT(col) gives non-null count - count_exprs = [] + count_exprs: list[str] = [] for col in columns: col_name = col.column_name # Quote column names to handle special characters and reserved words @@ -402,10 +402,7 @@ def _query_column_stats( count_exprs.append(f"COUNT({quoted}) AS non_null_{col_name}") count_exprs_str = ", ".join(count_exprs) - sql = ( - f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " - f"FROM {cache.schema_name}.{table_name}" - ) + sql = f"SELECT COUNT(*) AS total_rows, {count_exprs_str} FROM {cache.schema_name}.{table_name}" try: result = cache.run_sql_query(sql) From 4faa641a742aae1758f788ef8398bc381e771c8f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 00:31:01 +0000 Subject: [PATCH 03/49] fix: use uv run ruff format for CI-compatible formatting Co-Authored-By: AJ Steers --- airbyte/mcp/local.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte/mcp/local.py b/airbyte/mcp/local.py index 9b7ec53e8..4ba136f31 100644 --- a/airbyte/mcp/local.py +++ b/airbyte/mcp/local.py @@ -543,7 +543,9 @@ def get_stream_previews( ) source.set_config(config_dict) - streams_param: list[str] | Literal["*"] | None = resolve_list_of_strings(streams) # pyrefly: ignore[no-matching-overload] + streams_param: list[str] | Literal["*"] | None = resolve_list_of_strings( + streams + ) # pyrefly: ignore[no-matching-overload] if streams_param and len(streams_param) == 1 and streams_param[0] == "*": streams_param = "*" From 1a514c486f8fe4908d8f7527ff0cddc00a91bf81 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 00:43:28 +0000 Subject: [PATCH 04/49] refactor: use cache's built-in normalizers and move query methods to CacheBase Co-Authored-By: AJ Steers --- airbyte/_util/destination_readback.py | 246 ++++---------------------- airbyte/caches/base.py | 122 +++++++++++++ 2 files changed, 158 insertions(+), 210 deletions(-) diff --git a/airbyte/_util/destination_readback.py b/airbyte/_util/destination_readback.py index 97a26fc26..c995b64a3 100644 --- a/airbyte/_util/destination_readback.py +++ b/airbyte/_util/destination_readback.py @@ -6,7 +6,8 @@ and types, and per-column null/non-null counts. The readback leverages PyAirbyte's existing cache implementations to query -the same backends that destinations write to. +the same backends that destinations write to. Table and column name +normalization is delegated to the cache's built-in SQL processor normalizer. """ from __future__ import annotations @@ -16,12 +17,8 @@ from pydantic import BaseModel -from airbyte._util.name_normalizers import LowerCaseNormalizer - if TYPE_CHECKING: - from collections.abc import Callable - from airbyte.caches.base import CacheBase logger = logging.getLogger(__name__) @@ -65,96 +62,6 @@ def _get_readback_supported(destination_name: str) -> bool: return destination_name in SUPPORTED_DESTINATIONS -# --------------------------------------------------------------------------- -# Deterministic table name resolution -# --------------------------------------------------------------------------- - -# Destinations normalize stream names into SQL table names. The logic -# differs per destination. We hard-code the known conventions here so -# that we can *optimistically* compute the expected table name without -# scanning the schema. - - -def _normalize_table_name_default(stream_name: str) -> str: - """Default normalizer: lowercase + replace non-alphanumeric with underscores. - - This matches the LowerCaseNormalizer used by most PyAirbyte caches and - aligns with how the Airbyte Java/Python destinations normalize names. - """ - return LowerCaseNormalizer.normalize(stream_name) - - -def _normalize_table_name_snowflake(stream_name: str) -> str: - """Snowflake normalizer: same as default (lowercase). - - Snowflake destinations use quoted identifiers in lowercase, - matching the LowerCaseNormalizer behavior. - """ - return LowerCaseNormalizer.normalize(stream_name) - - -def _normalize_table_name_bigquery(stream_name: str) -> str: - """BigQuery normalizer: same as default (lowercase). - - BigQuery destinations use backtick-quoted identifiers in lowercase. - """ - return LowerCaseNormalizer.normalize(stream_name) - - -_DESTINATION_TABLE_NORMALIZERS: dict[str, Callable[[str], str]] = { - "destination-bigquery": _normalize_table_name_bigquery, - "destination-duckdb": _normalize_table_name_default, - "destination-motherduck": _normalize_table_name_default, - "destination-postgres": _normalize_table_name_default, - "destination-snowflake": _normalize_table_name_snowflake, -} - - -def _get_table_normalizer( - destination_name: str, -) -> Callable[[str], str]: - """Return the table name normalizer for the given destination.""" - return _DESTINATION_TABLE_NORMALIZERS.get( - destination_name, - _normalize_table_name_default, - ) - - -def _resolve_expected_table_name( - destination_name: str, - stream_name: str, -) -> str: - """Deterministically resolve the expected SQL table name for a stream. - - This uses the destination's known naming conventions to predict - what the table name should be in the backend. - """ - normalizer = _get_table_normalizer(destination_name) - return normalizer(stream_name) - - -# --------------------------------------------------------------------------- -# Column name normalization -# --------------------------------------------------------------------------- - - -def _normalize_column_name_default(column_name: str) -> str: - """Default column normalizer: lowercase + replace non-alphanumeric with underscores.""" - return LowerCaseNormalizer.normalize(column_name) - - -def _resolve_expected_column_name( - destination_name: str, - column_name: str, -) -> str: - """Deterministically resolve the expected SQL column name. - - For now all destinations use the same LowerCaseNormalizer for columns. - """ - _ = destination_name # Reserved for per-destination overrides - return _normalize_column_name_default(column_name) - - # --------------------------------------------------------------------------- # Readback result models # --------------------------------------------------------------------------- @@ -329,113 +236,6 @@ def _build_readback_cache( return cache -# --------------------------------------------------------------------------- -# Core readback logic -# --------------------------------------------------------------------------- - - -def _query_table_row_count( - cache: CacheBase, - table_name: str, -) -> int | None: - """Query the row count for a table. Returns None if the table doesn't exist.""" - try: - result = cache.run_sql_query( - f"SELECT COUNT(*) AS row_count FROM {cache.schema_name}.{table_name}", - ) - if result: - return int(result[0]["row_count"]) - return 0 # noqa: TRY300 - except Exception: - logger.debug("Table %s.%s not found or not accessible.", cache.schema_name, table_name) - return None - - -def _query_column_info( - cache: CacheBase, - table_name: str, -) -> list[ColumnInfo]: - """Query column names and types for a table. - - Uses a SELECT with LIMIT 0 to get column metadata from the result set, - avoiding INFORMATION_SCHEMA scanning. - """ - try: - # We use the SQLAlchemy engine's inspector for column info - import sqlalchemy # noqa: PLC0415 - - engine = cache.get_sql_engine() - inspector = sqlalchemy.inspect(engine) - columns = inspector.get_columns(table_name, schema=cache.schema_name) - return [ - ColumnInfo( - column_name=col["name"], - column_type=str(col["type"]), - ) - for col in columns - ] - except Exception: - logger.debug( - "Could not get column info for %s.%s", - cache.schema_name, - table_name, - ) - return [] - - -def _query_column_stats( - cache: CacheBase, - table_name: str, - columns: list[ColumnInfo], -) -> list[ColumnStats]: - """Query per-column null/non-null counts.""" - if not columns: - return [] - - # Build a SQL query that computes COUNT(*), COUNT(col) for each column - # COUNT(*) gives total rows, COUNT(col) gives non-null count - count_exprs: list[str] = [] - for col in columns: - col_name = col.column_name - # Quote column names to handle special characters and reserved words - quoted = f'"{col_name}"' - count_exprs.append(f"COUNT({quoted}) AS non_null_{col_name}") - - count_exprs_str = ", ".join(count_exprs) - sql = f"SELECT COUNT(*) AS total_rows, {count_exprs_str} FROM {cache.schema_name}.{table_name}" - - try: - result = cache.run_sql_query(sql) - except Exception: - logger.debug( - "Could not query column stats for %s.%s", - cache.schema_name, - table_name, - ) - return [] - - if not result: - return [] - - row = result[0] - total_rows = int(row["total_rows"]) - - stats = [] - for col in columns: - non_null_key = f"non_null_{col.column_name}" - non_null_count = int(row.get(non_null_key, 0)) - stats.append( - ColumnStats( - column_name=col.column_name, - null_count=total_rows - non_null_count, - non_null_count=non_null_count, - total_count=total_rows, - ) - ) - - return stats - - def run_destination_readback( *, destination_name: str, @@ -447,8 +247,9 @@ def run_destination_readback( This is the main entry point for readback introspection. It: 1. Constructs a cache that can query the destination's backend - 2. For each expected stream, resolves the expected table name - 3. Queries row counts, column info, and column stats + 2. For each expected stream, resolves the expected table name using the + cache's built-in processor normalizer + 3. Queries row counts, column info, and column stats via cache methods Returns a ``DestinationReadbackResult`` with three datasets: - tables: table names + row counts @@ -491,10 +292,14 @@ def run_destination_readback( tables_missing: list[str] = [] for stream_name in stream_names: - expected_table = _resolve_expected_table_name(destination_name, stream_name) + # Use the cache's processor normalizer to resolve the expected table name. + # This delegates to the same normalizer the cache uses for writing. + expected_table = cache.processor.get_sql_table_name(stream_name) # Optimistic: try to query the expected table directly - row_count = _query_table_row_count(cache, expected_table) + row_count = cache._readback_query_row_count( # noqa: SLF001 + expected_table, + ) if row_count is None: tables_missing.append(stream_name) @@ -508,11 +313,32 @@ def run_destination_readback( ) ) - # Get column info - columns = _query_column_info(cache, expected_table) + # Get column info via cache method + raw_columns = cache._readback_query_column_info( # noqa: SLF001 + expected_table, + ) + columns = [ + ColumnInfo( + column_name=c["column_name"], + column_type=c["column_type"], + ) + for c in raw_columns + ] - # Get column stats - column_stats = _query_column_stats(cache, expected_table, columns) + # Get column stats via cache method + raw_stats = cache._readback_query_column_stats( # noqa: SLF001 + expected_table, + raw_columns, + ) + column_stats = [ + ColumnStats( + column_name=s["column_name"], + null_count=s["null_count"], + non_null_count=s["non_null_count"], + total_count=s["total_count"], + ) + for s in raw_stats + ] table_reports.append( TableReadbackReport( diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index bb585184a..21255570d 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -439,6 +439,128 @@ def __iter__( # type: ignore [override] # Overriding Pydantic model method """Iterate over the streams in the cache.""" return ((name, dataset) for name, dataset in self.streams.items()) + # ---- Readback introspection helpers ---- + # These private methods support destination readback introspection, + # allowing the cache to query stats about data written by a destination. + + def _readback_query_row_count( + self, + table_name: str, + ) -> int | None: + """Query the row count for a table. Returns None if the table doesn't exist.""" + import logging # noqa: PLC0415 + + try: + result = self.run_sql_query( + f"SELECT COUNT(*) AS row_count FROM {self.schema_name}.{table_name}", + ) + if result: + return int(result[0]["row_count"]) + return 0 # noqa: TRY300 + except Exception: + logging.getLogger(__name__).debug( + "Table %s.%s not found or not accessible.", + self.schema_name, + table_name, + ) + return None + + def _readback_query_column_info( + self, + table_name: str, + ) -> list[dict[str, str]]: + """Query column names and types for a table. + + Returns a list of dicts with 'column_name' and 'column_type' keys. + """ + import logging # noqa: PLC0415 + + import sqlalchemy as sa # noqa: PLC0415 + + try: + engine = self.get_sql_engine() + inspector = sa.inspect(engine) + columns = inspector.get_columns(table_name, schema=self.schema_name) + return [ + { + "column_name": col["name"], + "column_type": str(col["type"]), + } + for col in columns + ] + except Exception: + logging.getLogger(__name__).debug( + "Could not get column info for %s.%s", + self.schema_name, + table_name, + ) + return [] + + def _readback_query_column_stats( + self, + table_name: str, + columns: list[dict[str, str]], + ) -> list[dict[str, Any]]: + """Query per-column null/non-null counts. + + Args: + table_name: The table to query. + columns: List of dicts with at least a 'column_name' key. + + Returns a list of dicts with column_name, null_count, non_null_count, + total_count keys. + """ + import logging # noqa: PLC0415 + + if not columns: + return [] + + # Build a SQL query that computes COUNT(*), COUNT(col) for each column. + # COUNT(*) gives total rows, COUNT(col) gives non-null count. + count_exprs: list[str] = [] + for col in columns: + col_name = col["column_name"] + quoted = f'"{col_name}"' + count_exprs.append(f"COUNT({quoted}) AS non_null_{col_name}") + + count_exprs_str = ", ".join(count_exprs) + sql = ( + f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " + f"FROM {self.schema_name}.{table_name}" + ) + + try: + result = self.run_sql_query(sql) + except Exception: + logging.getLogger(__name__).debug( + "Could not query column stats for %s.%s", + self.schema_name, + table_name, + ) + return [] + + if not result: + return [] + + row = result[0] + total_rows = int(row["total_rows"]) + + stats = [] + for col in columns: + col_name = col["column_name"] + non_null_key = f"non_null_{col_name}" + non_null_count = int(row.get(non_null_key, 0)) + stats.append( + { + "column_name": col_name, + "null_count": total_rows - non_null_count, + "non_null_count": non_null_count, + "total_count": total_rows, + } + ) + + return stats + def _write_airbyte_message_stream( self, stdin: IO[str] | AirbyteMessageIterator, From 6740c3d20cd26b9a880ffb9129c91467060620c2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 01:59:30 +0000 Subject: [PATCH 05/49] refactor: add get_sql_cache() to Destination, simplify readback to use it Co-Authored-By: AJ Steers --- airbyte/_util/destination_readback.py | 109 +++-------------------- airbyte/_util/destination_smoke_tests.py | 4 +- airbyte/destinations/base.py | 74 +++++++++++++++ 3 files changed, 88 insertions(+), 99 deletions(-) diff --git a/airbyte/_util/destination_readback.py b/airbyte/_util/destination_readback.py index c995b64a3..723a9d36f 100644 --- a/airbyte/_util/destination_readback.py +++ b/airbyte/_util/destination_readback.py @@ -19,49 +19,11 @@ if TYPE_CHECKING: - from airbyte.caches.base import CacheBase + from airbyte.destinations.base import Destination logger = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Destination-to-cache mapping -# --------------------------------------------------------------------------- - -# Maps destination connector names to cache class import paths. -# We use strings to avoid importing all cache classes at module load time. -_DESTINATION_TO_CACHE_INFO: dict[str, dict[str, str]] = { - "destination-bigquery": { - "module": "airbyte.caches.bigquery", - "class": "BigQueryCache", - }, - "destination-duckdb": { - "module": "airbyte.caches.duckdb", - "class": "DuckDBCache", - }, - "destination-motherduck": { - "module": "airbyte.caches.motherduck", - "class": "MotherDuckCache", - }, - "destination-postgres": { - "module": "airbyte.caches.postgres", - "class": "PostgresCache", - }, - "destination-snowflake": { - "module": "airbyte.caches.snowflake", - "class": "SnowflakeCache", - }, -} - -SUPPORTED_DESTINATIONS: frozenset[str] = frozenset(_DESTINATION_TO_CACHE_INFO.keys()) -"""Destination connector names that support readback introspection.""" - - -def _get_readback_supported(destination_name: str) -> bool: - """Return True if readback is supported for the given destination.""" - return destination_name in SUPPORTED_DESTINATIONS - - # --------------------------------------------------------------------------- # Readback result models # --------------------------------------------------------------------------- @@ -191,62 +153,21 @@ def get_column_stats_summary(self) -> list[dict[str, Any]]: # --------------------------------------------------------------------------- -# Cache construction from destination config +# Readback orchestration # --------------------------------------------------------------------------- -def _build_readback_cache( - destination_name: str, - destination_config: dict[str, Any], - namespace: str, -) -> CacheBase: - """Construct a cache instance that can query the destination's backend. - - The cache is configured to point at the same backend the destination - wrote to, using the supplied namespace as the schema. - - Raises: - NotImplementedError: If the destination is not supported. - """ - from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 - destination_to_cache, - ) - - if destination_name not in SUPPORTED_DESTINATIONS: - raise NotImplementedError( - f"Readback is not supported for '{destination_name}'. " - f"Supported destinations: {sorted(SUPPORTED_DESTINATIONS)}" - ) - - # The destination_to_cache function expects the config to have - # a 'destinationType' field. We ensure it's present. - config_with_type = dict(destination_config) - if "destinationType" not in config_with_type and "DESTINATION_TYPE" not in config_with_type: - # Infer the type from the destination name - dest_type = destination_name.replace("destination-", "") - config_with_type["destinationType"] = dest_type - - cache = destination_to_cache(config_with_type) - - # Override the schema to match the namespace used by the smoke test - if hasattr(cache, "schema_name"): - # Use model_copy to create a new instance with updated schema - cache = cache.model_copy(update={"schema_name": namespace}) - - return cache - - def run_destination_readback( *, - destination_name: str, - destination_config: dict[str, Any], + destination: Destination, namespace: str, stream_names: list[str], ) -> DestinationReadbackResult: """Read back data from a destination after a smoke test and produce stats. This is the main entry point for readback introspection. It: - 1. Constructs a cache that can query the destination's backend + 1. Builds a cache via ``destination.get_sql_cache()`` (same pattern as + :pymethod:`airbyte.cloud.sync_results.SyncResult.get_sql_cache`) 2. For each expected stream, resolves the expected table name using the cache's built-in processor normalizer 3. Queries row counts, column info, and column stats via cache methods @@ -259,26 +180,22 @@ def run_destination_readback( If the destination is not supported for readback, returns a result with ``readback_supported=False`` and empty data. """ - if not _get_readback_supported(destination_name): + try: + cache = destination.get_sql_cache(schema_name=namespace) + except ValueError: + # destination_to_cache raises ValueError for unsupported types return DestinationReadbackResult( - destination=destination_name, + destination=destination.name, namespace=namespace, readback_supported=False, tables=[], table_reports=[], tables_missing=stream_names, ) - - try: - cache = _build_readback_cache( - destination_name=destination_name, - destination_config=destination_config, - namespace=namespace, - ) except Exception as ex: - logger.warning("Failed to build readback cache for %s: %s", destination_name, ex) + logger.warning("Failed to build readback cache for %s: %s", destination.name, ex) return DestinationReadbackResult( - destination=destination_name, + destination=destination.name, namespace=namespace, readback_supported=True, tables=[], @@ -351,7 +268,7 @@ def run_destination_readback( ) return DestinationReadbackResult( - destination=destination_name, + destination=destination.name, namespace=namespace, readback_supported=True, tables=tables, diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index bb65c3d77..817bd2809 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -306,10 +306,8 @@ def run_destination_smoke_test( readback_result: DestinationReadbackResult | None = None if success: try: - destination_config = destination.get_config() readback_result = run_destination_readback( - destination_name=destination.name, - destination_config=destination_config, + destination=destination, namespace=namespace, stream_names=stream_names, ) diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index ce391e109..ddaf6a3ef 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -35,6 +35,9 @@ from airbyte.shared.state_writers import StateWriterBase +_CANONICAL_PREFIX = "destination-" + + class Destination(ConnectorBase, AirbyteWriterInterface): """A class representing a destination that can be called.""" @@ -61,6 +64,77 @@ def __init__( validate=validate, ) + @staticmethod + def _normalize_destination_name(name: str) -> str: + """Normalize a destination name to canonical form (``destination-``). + + Accepts either the short form (e.g. ``snowflake``) or the canonical + form (e.g. ``destination-snowflake``). + """ + if not name.startswith(_CANONICAL_PREFIX): + return f"{_CANONICAL_PREFIX}{name}" + return name + + def get_sql_cache( + self, + *, + schema_name: str | None = None, + destination_name: str | None = None, + destination_config: dict[str, Any] | None = None, + version: str | None = None, + ) -> CacheBase: + """Return a SQL Cache for querying data written by this destination. + + This follows the same pattern as + :pymethod:`airbyte.cloud.sync_results.SyncResult.get_sql_cache`: + it builds a cache from the destination's configuration using + ``destination_to_cache()``. + + Args: + schema_name: Override the schema/namespace on the returned cache. + When ``None`` the cache uses the default schema from the + destination config. + destination_name: The canonical destination connector name + (e.g. ``destination-snowflake`` or ``snowflake``). When + ``None``, ``self.name`` is used. + destination_config: The destination configuration dict. When + ``None``, ``self.get_config()`` is used. + version: Destination version string. Currently only ``"latest"`` + (or ``None``, which is treated as ``"latest"``) is accepted. + Any other value raises ``NotImplementedError``. + + Raises: + NotImplementedError: If *version* is not ``"latest"`` or ``None``. + ValueError: If the destination type is not supported. + """ + from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 + destination_to_cache, + ) + + # Version gate - future-proof the signature. + if version is not None and version != "latest": + raise NotImplementedError( + f"Only version='latest' (or None) is currently supported. " f"Got: {version!r}" + ) + + resolved_name = self._normalize_destination_name( + destination_name or self.name, + ) + config = dict(destination_config or self.get_config()) + + # Ensure the config carries a destinationType key so that + # destination_to_cache() can dispatch correctly. + if "destinationType" not in config and "DESTINATION_TYPE" not in config: + dest_type = resolved_name.replace(_CANONICAL_PREFIX, "") + config["destinationType"] = dest_type + + cache: CacheBase = destination_to_cache(config) + + if schema_name is not None: + cache = cache.model_copy(update={"schema_name": schema_name}) + + return cache + def write( # noqa: PLR0912, PLR0915 # Too many arguments/statements self, source_data: Source | ReadResult, From 6fa7180e48bf12c87741b2847fea2532474c79d9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 02:05:29 +0000 Subject: [PATCH 06/49] fix: case-insensitive key lookup for column stats across DBs Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 21255570d..4551632be 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -549,7 +549,9 @@ def _readback_query_column_stats( for col in columns: col_name = col["column_name"] non_null_key = f"non_null_{col_name}" - non_null_count = int(row.get(non_null_key, 0)) + # Try original case first, then lowercase for DBs that normalize + # result keys (e.g. PostgreSQL lowercases unquoted identifiers). + non_null_count = int(row.get(non_null_key) or row.get(non_null_key.lower(), 0)) stats.append( { "column_name": col_name, From 4885a167b72b304add138382f31e537030bf2748 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:38:34 +0000 Subject: [PATCH 07/49] refactor: eliminate destination_readback.py, fix key casing and secret hydration bugs - Remove destination_readback.py entirely per AJ's review - Move readback models into destination_smoke_tests.py (only consumer) - Add _readback_get_table_report() and _readback_get_results() to CacheBase to own the full orchestration loop (eliminates SLF001 suppressions) - Inline readback orchestration in run_destination_smoke_test() - Fix: case-insensitive row key lookup via casefold map (Snowflake/PG) - Fix: use 'is not None' instead of truthy check for 0-valued counts - Fix: use self._hydrated_config instead of self.get_config() to resolve secret references in destination config Co-Authored-By: AJ Steers --- airbyte/_util/destination_readback.py | 277 ----------------------- airbyte/_util/destination_smoke_tests.py | 162 ++++++++++++- airbyte/caches/base.py | 80 ++++++- airbyte/destinations/base.py | 2 +- 4 files changed, 229 insertions(+), 292 deletions(-) delete mode 100644 airbyte/_util/destination_readback.py diff --git a/airbyte/_util/destination_readback.py b/airbyte/_util/destination_readback.py deleted file mode 100644 index 723a9d36f..000000000 --- a/airbyte/_util/destination_readback.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2024 Airbyte, Inc., all rights reserved. -"""Destination readback introspection for smoke tests. - -This module provides the ability to read back data written by a destination -connector and produce stats-level reports: table row counts, column names -and types, and per-column null/non-null counts. - -The readback leverages PyAirbyte's existing cache implementations to query -the same backends that destinations write to. Table and column name -normalization is delegated to the cache's built-in SQL processor normalizer. -""" - -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any - -from pydantic import BaseModel - - -if TYPE_CHECKING: - from airbyte.destinations.base import Destination - -logger = logging.getLogger(__name__) - - -# --------------------------------------------------------------------------- -# Readback result models -# --------------------------------------------------------------------------- - - -class ColumnStats(BaseModel): - """Null/non-null statistics for a single column.""" - - column_name: str - """The column name as found in the destination.""" - - null_count: int - """Number of NULL values in this column.""" - - non_null_count: int - """Number of non-NULL values in this column.""" - - total_count: int - """Total row count (null_count + non_null_count).""" - - -class ColumnInfo(BaseModel): - """Column name and type information.""" - - column_name: str - """The column name as found in the destination.""" - - column_type: str - """The SQL data type name as reported by the database.""" - - -class TableInfo(BaseModel): - """Basic table info: name and row count.""" - - table_name: str - """The table name as found in the destination.""" - - row_count: int - """Number of rows in the table.""" - - expected_stream_name: str - """The original stream name that this table corresponds to.""" - - -class TableReadbackReport(BaseModel): - """Full readback report for a single table.""" - - table_name: str - """The table name as found in the destination.""" - - expected_stream_name: str - """The original stream name.""" - - row_count: int - """Number of rows found.""" - - columns: list[ColumnInfo] - """Column names and types.""" - - column_stats: list[ColumnStats] - """Per-column null/non-null statistics.""" - - -class DestinationReadbackResult(BaseModel): - """Result of reading back destination-written data. - - Contains three logical datasets: - 1. tables - list of tables with row counts - 2. columns - per-table column names and types - 3. column_stats - per-table, per-column null/non-null counts - """ - - destination: str - """The destination connector name.""" - - namespace: str - """The namespace (schema) that was inspected.""" - - readback_supported: bool - """Whether readback was supported for this destination.""" - - tables: list[TableInfo] - """Dataset 1: Tables found with row counts.""" - - table_reports: list[TableReadbackReport] - """Full per-table reports including columns and stats.""" - - tables_missing: list[str] - """Stream names for which the expected table was not found.""" - - error: str | None = None - """Error message if readback failed.""" - - def get_tables_summary(self) -> list[dict[str, Any]]: - """Return dataset 1: tables with row counts as plain dicts.""" - return [t.model_dump() for t in self.tables] - - def get_columns_summary(self) -> list[dict[str, Any]]: - """Return dataset 2: columns with types, grouped by table.""" - result = [] - for report in self.table_reports: - result.extend( - { - "table_name": report.table_name, - "column_name": col.column_name, - "column_type": col.column_type, - } - for col in report.columns - ) - return result - - def get_column_stats_summary(self) -> list[dict[str, Any]]: - """Return dataset 3: per-column null/non-null counts.""" - result = [] - for report in self.table_reports: - result.extend( - { - "table_name": report.table_name, - "column_name": stat.column_name, - "null_count": stat.null_count, - "non_null_count": stat.non_null_count, - "total_count": stat.total_count, - } - for stat in report.column_stats - ) - return result - - -# --------------------------------------------------------------------------- -# Readback orchestration -# --------------------------------------------------------------------------- - - -def run_destination_readback( - *, - destination: Destination, - namespace: str, - stream_names: list[str], -) -> DestinationReadbackResult: - """Read back data from a destination after a smoke test and produce stats. - - This is the main entry point for readback introspection. It: - 1. Builds a cache via ``destination.get_sql_cache()`` (same pattern as - :pymethod:`airbyte.cloud.sync_results.SyncResult.get_sql_cache`) - 2. For each expected stream, resolves the expected table name using the - cache's built-in processor normalizer - 3. Queries row counts, column info, and column stats via cache methods - - Returns a ``DestinationReadbackResult`` with three datasets: - - tables: table names + row counts - - columns: column names + types per table - - column_stats: null/non-null counts per column per table - - If the destination is not supported for readback, returns a result - with ``readback_supported=False`` and empty data. - """ - try: - cache = destination.get_sql_cache(schema_name=namespace) - except ValueError: - # destination_to_cache raises ValueError for unsupported types - return DestinationReadbackResult( - destination=destination.name, - namespace=namespace, - readback_supported=False, - tables=[], - table_reports=[], - tables_missing=stream_names, - ) - except Exception as ex: - logger.warning("Failed to build readback cache for %s: %s", destination.name, ex) - return DestinationReadbackResult( - destination=destination.name, - namespace=namespace, - readback_supported=True, - tables=[], - table_reports=[], - tables_missing=stream_names, - error=f"Failed to build readback cache: {ex}", - ) - - tables: list[TableInfo] = [] - table_reports: list[TableReadbackReport] = [] - tables_missing: list[str] = [] - - for stream_name in stream_names: - # Use the cache's processor normalizer to resolve the expected table name. - # This delegates to the same normalizer the cache uses for writing. - expected_table = cache.processor.get_sql_table_name(stream_name) - - # Optimistic: try to query the expected table directly - row_count = cache._readback_query_row_count( # noqa: SLF001 - expected_table, - ) - - if row_count is None: - tables_missing.append(stream_name) - continue - - tables.append( - TableInfo( - table_name=expected_table, - row_count=row_count, - expected_stream_name=stream_name, - ) - ) - - # Get column info via cache method - raw_columns = cache._readback_query_column_info( # noqa: SLF001 - expected_table, - ) - columns = [ - ColumnInfo( - column_name=c["column_name"], - column_type=c["column_type"], - ) - for c in raw_columns - ] - - # Get column stats via cache method - raw_stats = cache._readback_query_column_stats( # noqa: SLF001 - expected_table, - raw_columns, - ) - column_stats = [ - ColumnStats( - column_name=s["column_name"], - null_count=s["null_count"], - non_null_count=s["non_null_count"], - total_count=s["total_count"], - ) - for s in raw_stats - ] - - table_reports.append( - TableReadbackReport( - table_name=expected_table, - expected_stream_name=stream_name, - row_count=row_count, - columns=columns, - column_stats=column_stats, - ) - ) - - return DestinationReadbackResult( - destination=destination.name, - namespace=namespace, - readback_supported=True, - tables=tables, - table_reports=table_reports, - tables_missing=tables_missing, - ) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 817bd2809..7224d1d8f 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -27,10 +27,6 @@ from pydantic import BaseModel from airbyte import get_source -from airbyte._util.destination_readback import ( - DestinationReadbackResult, - run_destination_readback, -) from airbyte.exceptions import PyAirbyteInputError @@ -72,6 +68,139 @@ def generate_namespace( return f"{NAMESPACE_PREFIX}_{ts}_{suffix}" +# --------------------------------------------------------------------------- +# Readback result models +# --------------------------------------------------------------------------- + + +class ColumnStats(BaseModel): + """Null/non-null statistics for a single column.""" + + column_name: str + """The column name as found in the destination.""" + + null_count: int + """Number of NULL values in this column.""" + + non_null_count: int + """Number of non-NULL values in this column.""" + + total_count: int + """Total row count (null_count + non_null_count).""" + + +class ColumnInfo(BaseModel): + """Column name and type information.""" + + column_name: str + """The column name as found in the destination.""" + + column_type: str + """The SQL data type name as reported by the database.""" + + +class TableInfo(BaseModel): + """Basic table info: name and row count.""" + + table_name: str + """The table name as found in the destination.""" + + row_count: int + """Number of rows in the table.""" + + expected_stream_name: str + """The original stream name that this table corresponds to.""" + + +class TableReadbackReport(BaseModel): + """Full readback report for a single table.""" + + table_name: str + """The table name as found in the destination.""" + + expected_stream_name: str + """The original stream name.""" + + row_count: int + """Number of rows found.""" + + columns: list[ColumnInfo] + """Column names and types.""" + + column_stats: list[ColumnStats] + """Per-column null/non-null statistics.""" + + +class DestinationReadbackResult(BaseModel): + """Result of reading back destination-written data. + + Contains three logical datasets: + 1. tables - list of tables with row counts + 2. columns - per-table column names and types + 3. column_stats - per-table, per-column null/non-null counts + """ + + destination: str + """The destination connector name.""" + + namespace: str + """The namespace (schema) that was inspected.""" + + readback_supported: bool + """Whether readback was supported for this destination.""" + + tables: list[TableInfo] + """Dataset 1: Tables found with row counts.""" + + table_reports: list[TableReadbackReport] + """Full per-table reports including columns and stats.""" + + tables_missing: list[str] + """Stream names for which the expected table was not found.""" + + error: str | None = None + """Error message if readback failed.""" + + def get_tables_summary(self) -> list[dict[str, Any]]: + """Return dataset 1: tables with row counts as plain dicts.""" + return [t.model_dump() for t in self.tables] + + def get_columns_summary(self) -> list[dict[str, Any]]: + """Return dataset 2: columns with types, grouped by table.""" + result = [] + for report in self.table_reports: + result.extend( + { + "table_name": report.table_name, + "column_name": col.column_name, + "column_type": col.column_type, + } + for col in report.columns + ) + return result + + def get_column_stats_summary(self) -> list[dict[str, Any]]: + """Return dataset 3: per-column null/non-null counts.""" + result = [] + for report in self.table_reports: + result.extend( + { + "table_name": report.table_name, + "column_name": stat.column_name, + "null_count": stat.null_count, + "non_null_count": stat.non_null_count, + "total_count": stat.total_count, + } + for stat in report.column_stats + ) + return result + + +# --------------------------------------------------------------------------- +# Smoke test result model +# --------------------------------------------------------------------------- + + class DestinationSmokeTestResult(BaseModel): """Result of a destination smoke test run.""" @@ -306,12 +435,27 @@ def run_destination_smoke_test( readback_result: DestinationReadbackResult | None = None if success: try: - readback_result = run_destination_readback( - destination=destination, + cache = destination.get_sql_cache(schema_name=namespace) + raw = cache._readback_get_results(stream_names) # noqa: SLF001 + readback_result = DestinationReadbackResult( + destination=destination.name, namespace=namespace, - stream_names=stream_names, + readback_supported=True, + tables=[TableInfo(**t) for t in raw["tables"]], + table_reports=[ + TableReadbackReport( + table_name=r["table_name"], + expected_stream_name=r["expected_stream_name"], + row_count=r["row_count"], + columns=[ColumnInfo(**c) for c in r["columns"]], + column_stats=[ColumnStats(**s) for s in r["column_stats"]], + ) + for r in raw["table_reports"] + ], + tables_missing=raw["tables_missing"], ) - except NotImplementedError: + except ValueError: + # destination_to_cache raises ValueError for unsupported types logger.info( "Readback not supported for destination '%s'.", destination.name, @@ -320,7 +464,7 @@ def run_destination_smoke_test( logger.warning( "Readback failed for destination '%s': %s", destination.name, - ex, + _sanitize_error(ex), ) return DestinationSmokeTestResult( diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 4551632be..9922118a2 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -543,15 +543,17 @@ def _readback_query_column_stats( return [] row = result[0] - total_rows = int(row["total_rows"]) + # Case-insensitive key lookup: some DBs uppercase (Snowflake) or + # lowercase (PostgreSQL) unquoted SQL aliases. + row_ci = {k.lower(): v for k, v in row.items()} + total_rows = int(row_ci.get("total_rows", 0)) stats = [] for col in columns: col_name = col["column_name"] - non_null_key = f"non_null_{col_name}" - # Try original case first, then lowercase for DBs that normalize - # result keys (e.g. PostgreSQL lowercases unquoted identifiers). - non_null_count = int(row.get(non_null_key) or row.get(non_null_key.lower(), 0)) + non_null_key = f"non_null_{col_name}".lower() + val = row_ci.get(non_null_key) + non_null_count = int(val) if val is not None else 0 stats.append( { "column_name": col_name, @@ -563,6 +565,74 @@ def _readback_query_column_stats( return stats + def _readback_get_table_report( + self, + table_name: str, + stream_name: str, + ) -> dict[str, Any] | None: + """Build a full readback report for a single table. + + Composes ``_readback_query_row_count``, ``_readback_query_column_info``, + and ``_readback_query_column_stats`` into a single dict suitable for + constructing a ``TableReadbackReport``. + + Returns ``None`` when the table does not exist (row count is ``None``). + """ + row_count = self._readback_query_row_count(table_name) + if row_count is None: + return None + + raw_columns = self._readback_query_column_info(table_name) + raw_stats = self._readback_query_column_stats(table_name, raw_columns) + + return { + "table_name": table_name, + "expected_stream_name": stream_name, + "row_count": row_count, + "columns": raw_columns, + "column_stats": raw_stats, + } + + def _readback_get_results( + self, + stream_names: list[str], + ) -> dict[str, Any]: + """Run readback introspection for the given streams. + + For each stream, resolves the expected table name via the cache's + built-in processor normalizer, then queries row counts, column info, + and column stats. + + Returns a dict with keys ``tables``, ``table_reports``, and + ``tables_missing`` ready for constructing a result model. + """ + tables: list[dict[str, Any]] = [] + table_reports: list[dict[str, Any]] = [] + tables_missing: list[str] = [] + + for stream_name in stream_names: + expected_table = self.processor.get_sql_table_name(stream_name) + report = self._readback_get_table_report(expected_table, stream_name) + + if report is None: + tables_missing.append(stream_name) + continue + + tables.append( + { + "table_name": report["table_name"], + "row_count": report["row_count"], + "expected_stream_name": stream_name, + } + ) + table_reports.append(report) + + return { + "tables": tables, + "table_reports": table_reports, + "tables_missing": tables_missing, + } + def _write_airbyte_message_stream( self, stdin: IO[str] | AirbyteMessageIterator, diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index ddaf6a3ef..f526c5118 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -120,7 +120,7 @@ def get_sql_cache( resolved_name = self._normalize_destination_name( destination_name or self.name, ) - config = dict(destination_config or self.get_config()) + config = dict(destination_config or self._hydrated_config) # Ensure the config carries a destinationType key so that # destination_to_cache() can dispatch correctly. From fffbceadb2d1e68bd06da8a01806afea9d9eecc2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:41:05 +0000 Subject: [PATCH 08/49] style: convert docstring from reST to markdown syntax Co-Authored-By: AJ Steers --- airbyte/destinations/base.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index f526c5118..d66d91762 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -86,25 +86,25 @@ def get_sql_cache( """Return a SQL Cache for querying data written by this destination. This follows the same pattern as - :pymethod:`airbyte.cloud.sync_results.SyncResult.get_sql_cache`: + `SyncResult.get_sql_cache()` in `airbyte.cloud.sync_results`: it builds a cache from the destination's configuration using - ``destination_to_cache()``. + `destination_to_cache()`. Args: schema_name: Override the schema/namespace on the returned cache. - When ``None`` the cache uses the default schema from the + When `None` the cache uses the default schema from the destination config. destination_name: The canonical destination connector name - (e.g. ``destination-snowflake`` or ``snowflake``). When - ``None``, ``self.name`` is used. + (e.g. `destination-snowflake` or `snowflake`). When + `None`, `self.name` is used. destination_config: The destination configuration dict. When - ``None``, ``self.get_config()`` is used. - version: Destination version string. Currently only ``"latest"`` - (or ``None``, which is treated as ``"latest"``) is accepted. - Any other value raises ``NotImplementedError``. + `None`, `self.get_config()` is used. + version: Destination version string. Currently only `"latest"` + (or `None`, which is treated as `"latest"`) is accepted. + Any other value raises `NotImplementedError`. Raises: - NotImplementedError: If *version* is not ``"latest"`` or ``None``. + NotImplementedError: If `version` is not `"latest"` or `None`. ValueError: If the destination type is not supported. """ from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 From cda5d8147d9063515e624a8fd798215fd8e78b01 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:50:02 +0000 Subject: [PATCH 09/49] fix: use positional aliases in column stats query to avoid truncation PostgreSQL truncates unquoted identifiers at 63 chars. Using positional aliases (nn_0, nn_1, ...) instead of non_null_{col_name} avoids mismatches when column names are long. Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 9922118a2..ea481b5b9 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -517,11 +517,13 @@ def _readback_query_column_stats( # Build a SQL query that computes COUNT(*), COUNT(col) for each column. # COUNT(*) gives total rows, COUNT(col) gives non-null count. + # We use positional aliases (nn_0, nn_1, ...) to avoid issues with + # databases that truncate long identifiers (PostgreSQL: 63 chars). count_exprs: list[str] = [] - for col in columns: + for idx, col in enumerate(columns): col_name = col["column_name"] quoted = f'"{col_name}"' - count_exprs.append(f"COUNT({quoted}) AS non_null_{col_name}") + count_exprs.append(f"COUNT({quoted}) AS nn_{idx}") count_exprs_str = ", ".join(count_exprs) sql = ( @@ -549,9 +551,9 @@ def _readback_query_column_stats( total_rows = int(row_ci.get("total_rows", 0)) stats = [] - for col in columns: + for idx, col in enumerate(columns): col_name = col["column_name"] - non_null_key = f"non_null_{col_name}".lower() + non_null_key = f"nn_{idx}" val = row_ci.get(non_null_key) non_null_count = int(val) if val is not None else 0 stats.append( From d75b0036d1ee22e3754937f9d99d928894af0f9c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:55:55 +0000 Subject: [PATCH 10/49] fix: quote table names in readback SQL queries for Snowflake compatibility Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index ea481b5b9..d052c9a3b 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -452,7 +452,7 @@ def _readback_query_row_count( try: result = self.run_sql_query( - f"SELECT COUNT(*) AS row_count FROM {self.schema_name}.{table_name}", + f'SELECT COUNT(*) AS row_count FROM {self.schema_name}."{table_name}"', ) if result: return int(result[0]["row_count"]) @@ -528,7 +528,7 @@ def _readback_query_column_stats( count_exprs_str = ", ".join(count_exprs) sql = ( f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " - f"FROM {self.schema_name}.{table_name}" + f'FROM {self.schema_name}."{table_name}"' ) try: From 8ce95e42dd5c960ccece6aaa02415a88d9d6e165 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 06:58:41 +0000 Subject: [PATCH 11/49] fix: handle dict config in postgres_destination_to_cache for local readback Co-Authored-By: AJ Steers --- airbyte/destinations/_translate_dest_to_cache.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index 8f7055708..de6ef46a5 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -119,9 +119,18 @@ def motherduck_destination_to_cache( def postgres_destination_to_cache( - destination_configuration: DestinationPostgres, + destination_configuration: DestinationPostgres | dict[str, Any], ) -> PostgresCache: """Create a new Postgres cache from the destination configuration.""" + if isinstance(destination_configuration, dict): + # Strip dispatch keys before constructing the model object. + filtered = { + k: v + for k, v in destination_configuration.items() + if k not in {"destinationType", "DESTINATION_TYPE"} + } + destination_configuration = DestinationPostgres(**filtered) + port: int = int(destination_configuration.port) if destination_configuration.port else 5432 if not destination_configuration.password: raise ValueError("Password is required for Postgres cache.") From 2b7d3ce9d5c2b2327567d04e46e27504fc4da0c4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:05:15 +0000 Subject: [PATCH 12/49] fix: case-insensitive row_count lookup + fallback to original stream name for CamelCase tables Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index d052c9a3b..45339041b 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -455,7 +455,8 @@ def _readback_query_row_count( f'SELECT COUNT(*) AS row_count FROM {self.schema_name}."{table_name}"', ) if result: - return int(result[0]["row_count"]) + row_ci = {k.lower(): v for k, v in result[0].items()} + return int(row_ci.get("row_count", 0)) return 0 # noqa: TRY300 except Exception: logging.getLogger(__name__).debug( @@ -616,6 +617,13 @@ def _readback_get_results( expected_table = self.processor.get_sql_table_name(stream_name) report = self._readback_get_table_report(expected_table, stream_name) + # Fallback: if the normalized name isn't found, try the + # original stream name. Some destinations preserve the + # original casing (e.g. "CamelCaseStreamName") while the + # cache normalizer lowercases it. + if report is None and expected_table != stream_name: + report = self._readback_get_table_report(stream_name, stream_name) + if report is None: tables_missing.append(stream_name) continue From 40944f959d0f448de16f055a0f778f071d6971d4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:12:28 +0000 Subject: [PATCH 13/49] fix: move inline imports to top-level in CacheBase readback methods Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 45339041b..f9eb82430 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -4,6 +4,7 @@ from __future__ import annotations import contextlib +import logging from pathlib import Path from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, final @@ -11,6 +12,7 @@ import pyarrow as pa import pyarrow.dataset as ds from pydantic import Field, PrivateAttr +import sqlalchemy as sa from sqlalchemy import exc as sqlalchemy_exc from sqlalchemy import text from typing_extensions import Self @@ -448,8 +450,6 @@ def _readback_query_row_count( table_name: str, ) -> int | None: """Query the row count for a table. Returns None if the table doesn't exist.""" - import logging # noqa: PLC0415 - try: result = self.run_sql_query( f'SELECT COUNT(*) AS row_count FROM {self.schema_name}."{table_name}"', @@ -474,10 +474,6 @@ def _readback_query_column_info( Returns a list of dicts with 'column_name' and 'column_type' keys. """ - import logging # noqa: PLC0415 - - import sqlalchemy as sa # noqa: PLC0415 - try: engine = self.get_sql_engine() inspector = sa.inspect(engine) @@ -511,8 +507,6 @@ def _readback_query_column_stats( Returns a list of dicts with column_name, null_count, non_null_count, total_count keys. """ - import logging # noqa: PLC0415 - if not columns: return [] From 08f1b4c3d89da4ccaa2d8749a844ed8ccefebc73 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:15:59 +0000 Subject: [PATCH 14/49] fix: import sorting in base.py + handle dict config and /local path in duckdb_destination_to_cache Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 2 +- .../destinations/_translate_dest_to_cache.py | 24 +++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index f9eb82430..e446021ba 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -11,8 +11,8 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -from pydantic import Field, PrivateAttr import sqlalchemy as sa +from pydantic import Field, PrivateAttr from sqlalchemy import exc as sqlalchemy_exc from sqlalchemy import text from typing_extensions import Self diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index de6ef46a5..bebfefdd6 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -95,11 +95,31 @@ def bigquery_destination_to_cache( def duckdb_destination_to_cache( - destination_configuration: DestinationDuckdb, + destination_configuration: DestinationDuckdb | dict[str, Any], ) -> DuckDBCache: """Create a new DuckDB cache from the destination configuration.""" + if isinstance(destination_configuration, dict): + filtered = { + k: v + for k, v in destination_configuration.items() + if k not in {"destinationType", "DESTINATION_TYPE"} + } + destination_configuration = DestinationDuckdb(**filtered) + + db_path = destination_configuration.destination_path + + # The DuckDB destination Docker container mounts a host directory to + # ``/local`` inside the container. Paths written as ``/local/foo.duckdb`` + # actually live at ``/destination-duckdb/foo.duckdb`` on the + # host. Resolve the host-side path so the cache can open the file. + if db_path.startswith(("/local/", "/local\\")): + from airbyte.constants import DEFAULT_PROJECT_DIR # noqa: PLC0415 + + host_path = str(DEFAULT_PROJECT_DIR / "destination-duckdb" / db_path[len("/local/"):]) + db_path = host_path + return DuckDBCache( - db_path=destination_configuration.destination_path, + db_path=db_path, schema_name=destination_configuration.schema or "main", ) From 1cdd9b0f66e6526c0f95bc693af6d624b6fae96e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:16:50 +0000 Subject: [PATCH 15/49] style: fix ruff format in duckdb_destination_to_cache Co-Authored-By: AJ Steers --- airbyte/destinations/_translate_dest_to_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index bebfefdd6..754752d37 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -115,7 +115,7 @@ def duckdb_destination_to_cache( if db_path.startswith(("/local/", "/local\\")): from airbyte.constants import DEFAULT_PROJECT_DIR # noqa: PLC0415 - host_path = str(DEFAULT_PROJECT_DIR / "destination-duckdb" / db_path[len("/local/"):]) + host_path = str(DEFAULT_PROJECT_DIR / "destination-duckdb" / db_path[len("/local/") :]) db_path = host_path return DuckDBCache( From f07a2236fb0c4f8cab49eb5890cf760116817d2e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:22:41 +0000 Subject: [PATCH 16/49] fix: handle dict config in motherduck_destination_to_cache Co-Authored-By: AJ Steers --- airbyte/destinations/_translate_dest_to_cache.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index 754752d37..6ee780602 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -125,9 +125,17 @@ def duckdb_destination_to_cache( def motherduck_destination_to_cache( - destination_configuration: DestinationDuckdb, + destination_configuration: DestinationDuckdb | dict[str, Any], ) -> MotherDuckCache: - """Create a new DuckDB cache from the destination configuration.""" + """Create a new MotherDuck cache from the destination configuration.""" + if isinstance(destination_configuration, dict): + filtered = { + k: v + for k, v in destination_configuration.items() + if k not in {"destinationType", "DESTINATION_TYPE"} + } + destination_configuration = DestinationDuckdb(**filtered) + if not destination_configuration.motherduck_api_key: raise ValueError("MotherDuck API key is required for MotherDuck cache.") From 76151ab2e1505a2560fb1fba1a5cf0dcf861cc33 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:32:50 +0000 Subject: [PATCH 17/49] fix: strip destinationType key in bigquery and snowflake dest-to-cache converters Co-Authored-By: AJ Steers --- airbyte/destinations/_translate_dest_to_cache.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index 6ee780602..dd3cefcb0 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -84,7 +84,12 @@ def bigquery_destination_to_cache( """ credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") if isinstance(destination_configuration, dict): - destination_configuration = DestinationBigquery(**destination_configuration) + filtered = { + k: v + for k, v in destination_configuration.items() + if k not in {"destinationType", "DESTINATION_TYPE"} + } + destination_configuration = DestinationBigquery(**filtered) return BigQueryCache( project_name=destination_configuration.project_id, @@ -183,7 +188,12 @@ def snowflake_destination_to_cache( is returned from the REST API. """ if isinstance(destination_configuration, dict): - destination_configuration = DestinationSnowflake(**destination_configuration) + filtered = { + k: v + for k, v in destination_configuration.items() + if k not in {"destinationType", "DESTINATION_TYPE"} + } + destination_configuration = DestinationSnowflake(**filtered) snowflake_password: str | None = None if ( From 08b64e439e4799b5b05f4332bc85bc290d26bb4c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 07:53:58 +0000 Subject: [PATCH 18/49] fix: use dialect-aware identifier quoting in readback SQL (BigQuery backticks) Co-Authored-By: AJ Steers --- airbyte/caches/base.py | 16 +++++++++++++--- airbyte/caches/bigquery.py | 4 ++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index e446021ba..06d138e19 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -445,14 +445,24 @@ def __iter__( # type: ignore [override] # Overriding Pydantic model method # These private methods support destination readback introspection, # allowing the cache to query stats about data written by a destination. + def _readback_quote_identifier(self, identifier: str) -> str: + """Quote an identifier for use in readback SQL queries. + + Defaults to ANSI double-quote style. Subclasses whose SQL dialect + uses a different quoting convention (e.g. BigQuery backticks) should + override this method. + """ + return f'"{identifier}"' + def _readback_query_row_count( self, table_name: str, ) -> int | None: """Query the row count for a table. Returns None if the table doesn't exist.""" try: + quoted_table = self._readback_quote_identifier(table_name) result = self.run_sql_query( - f'SELECT COUNT(*) AS row_count FROM {self.schema_name}."{table_name}"', + f"SELECT COUNT(*) AS row_count FROM {self.schema_name}.{quoted_table}", ) if result: row_ci = {k.lower(): v for k, v in result[0].items()} @@ -517,13 +527,13 @@ def _readback_query_column_stats( count_exprs: list[str] = [] for idx, col in enumerate(columns): col_name = col["column_name"] - quoted = f'"{col_name}"' + quoted = self._readback_quote_identifier(col_name) count_exprs.append(f"COUNT({quoted}) AS nn_{idx}") count_exprs_str = ", ".join(count_exprs) sql = ( f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " - f'FROM {self.schema_name}."{table_name}"' + f"FROM {self.schema_name}.{self._readback_quote_identifier(table_name)}" ) try: diff --git a/airbyte/caches/bigquery.py b/airbyte/caches/bigquery.py index a6aaf71e1..eec898155 100644 --- a/airbyte/caches/bigquery.py +++ b/airbyte/caches/bigquery.py @@ -43,6 +43,10 @@ class BigQueryCache(BigQueryConfig, CacheBase): paired_destination_name: ClassVar[str | None] = "destination-bigquery" paired_destination_config_class: ClassVar[type | None] = DestinationBigquery + def _readback_quote_identifier(self, identifier: str) -> str: + """BigQuery uses backticks instead of ANSI double-quotes.""" + return f"`{identifier}`" + @property def paired_destination_config(self) -> DestinationBigquery: """Return a dictionary of destination configuration values.""" From 73993fabcc10f0f217cb3556243323d8a4cd6e55 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:44:19 +0000 Subject: [PATCH 19/49] refactor: move readback logic from CacheBase to SqlProcessorBase per review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add ColumnStatistics and TableStatistics pydantic models to sql_processor.py - Add generic public methods on SqlProcessorBase: get_row_count(), get_column_info(), get_column_stats(), get_table_statistics() — no 'readback' naming - Delete all _readback_* private methods from CacheBase - Delete _readback_quote_identifier override from BigQueryCache - Add thin public get_table_statistics() on CacheBase that delegates to processor - Update destination_smoke_tests.py to use new structured API - Remove broad try/except handlers — let exceptions propagate Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 127 ++++--------- airbyte/caches/base.py | 213 ++-------------------- airbyte/caches/bigquery.py | 4 - airbyte/shared/sql_processor.py | 216 +++++++++++++++++++++-- 4 files changed, 241 insertions(+), 319 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 7224d1d8f..1d66bb981 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -28,6 +28,7 @@ from airbyte import get_source from airbyte.exceptions import PyAirbyteInputError +from airbyte.shared.sql_processor import TableStatistics # noqa: TC001 # Pydantic needs at runtime logger = logging.getLogger(__name__) @@ -73,71 +74,12 @@ def generate_namespace( # --------------------------------------------------------------------------- -class ColumnStats(BaseModel): - """Null/non-null statistics for a single column.""" - - column_name: str - """The column name as found in the destination.""" - - null_count: int - """Number of NULL values in this column.""" - - non_null_count: int - """Number of non-NULL values in this column.""" - - total_count: int - """Total row count (null_count + non_null_count).""" - - -class ColumnInfo(BaseModel): - """Column name and type information.""" - - column_name: str - """The column name as found in the destination.""" - - column_type: str - """The SQL data type name as reported by the database.""" - - -class TableInfo(BaseModel): - """Basic table info: name and row count.""" - - table_name: str - """The table name as found in the destination.""" - - row_count: int - """Number of rows in the table.""" - - expected_stream_name: str - """The original stream name that this table corresponds to.""" - - -class TableReadbackReport(BaseModel): - """Full readback report for a single table.""" - - table_name: str - """The table name as found in the destination.""" - - expected_stream_name: str - """The original stream name.""" - - row_count: int - """Number of rows found.""" - - columns: list[ColumnInfo] - """Column names and types.""" - - column_stats: list[ColumnStats] - """Per-column null/non-null statistics.""" - - class DestinationReadbackResult(BaseModel): """Result of reading back destination-written data. - Contains three logical datasets: - 1. tables - list of tables with row counts - 2. columns - per-table column names and types - 3. column_stats - per-table, per-column null/non-null counts + Uses ``TableStatistics`` from the SQL processor layer to provide + per-table row counts, column names/types, and per-column null/non-null + counts. """ destination: str @@ -149,11 +91,8 @@ class DestinationReadbackResult(BaseModel): readback_supported: bool """Whether readback was supported for this destination.""" - tables: list[TableInfo] - """Dataset 1: Tables found with row counts.""" - - table_reports: list[TableReadbackReport] - """Full per-table reports including columns and stats.""" + table_statistics: dict[str, TableStatistics] + """Map of stream name to table statistics (row counts, columns, stats).""" tables_missing: list[str] """Stream names for which the expected table was not found.""" @@ -163,35 +102,44 @@ class DestinationReadbackResult(BaseModel): def get_tables_summary(self) -> list[dict[str, Any]]: """Return dataset 1: tables with row counts as plain dicts.""" - return [t.model_dump() for t in self.tables] + return [ + { + "stream_name": stream_name, + "table_name": stats.table_name, + "row_count": stats.row_count, + } + for stream_name, stats in self.table_statistics.items() + ] def get_columns_summary(self) -> list[dict[str, Any]]: """Return dataset 2: columns with types, grouped by table.""" result = [] - for report in self.table_reports: + for stream_name, stats in self.table_statistics.items(): result.extend( { - "table_name": report.table_name, + "stream_name": stream_name, + "table_name": stats.table_name, "column_name": col.column_name, "column_type": col.column_type, } - for col in report.columns + for col in stats.column_statistics ) return result def get_column_stats_summary(self) -> list[dict[str, Any]]: """Return dataset 3: per-column null/non-null counts.""" result = [] - for report in self.table_reports: + for stream_name, stats in self.table_statistics.items(): result.extend( { - "table_name": report.table_name, - "column_name": stat.column_name, - "null_count": stat.null_count, - "non_null_count": stat.non_null_count, - "total_count": stat.total_count, + "stream_name": stream_name, + "table_name": stats.table_name, + "column_name": col.column_name, + "null_count": col.null_count, + "non_null_count": col.non_null_count, + "total_count": col.total_count, } - for stat in report.column_stats + for col in stats.column_statistics ) return result @@ -436,23 +384,14 @@ def run_destination_smoke_test( if success: try: cache = destination.get_sql_cache(schema_name=namespace) - raw = cache._readback_get_results(stream_names) # noqa: SLF001 + table_statistics = cache.get_table_statistics(stream_names) + tables_missing = [name for name in stream_names if name not in table_statistics] readback_result = DestinationReadbackResult( destination=destination.name, namespace=namespace, readback_supported=True, - tables=[TableInfo(**t) for t in raw["tables"]], - table_reports=[ - TableReadbackReport( - table_name=r["table_name"], - expected_stream_name=r["expected_stream_name"], - row_count=r["row_count"], - columns=[ColumnInfo(**c) for c in r["columns"]], - column_stats=[ColumnStats(**s) for s in r["column_stats"]], - ) - for r in raw["table_reports"] - ], - tables_missing=raw["tables_missing"], + table_statistics=table_statistics, + tables_missing=tables_missing, ) except ValueError: # destination_to_cache raises ValueError for unsupported types @@ -460,12 +399,6 @@ def run_destination_smoke_test( "Readback not supported for destination '%s'.", destination.name, ) - except Exception as ex: - logger.warning( - "Readback failed for destination '%s': %s", - destination.name, - _sanitize_error(ex), - ) return DestinationSmokeTestResult( success=success, diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 06d138e19..e3f5ebbce 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -4,14 +4,12 @@ from __future__ import annotations import contextlib -import logging from pathlib import Path from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, final import pandas as pd import pyarrow as pa import pyarrow.dataset as ds -import sqlalchemy as sa from pydantic import Field, PrivateAttr from sqlalchemy import exc as sqlalchemy_exc from sqlalchemy import text @@ -26,7 +24,7 @@ from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE, TEMP_FILE_CLEANUP from airbyte.datasets._sql import CachedDataset from airbyte.shared.catalog_providers import CatalogProvider -from airbyte.shared.sql_processor import SqlConfig +from airbyte.shared.sql_processor import SqlConfig, TableStatistics from airbyte.shared.state_writers import StdOutStateWriter @@ -441,211 +439,20 @@ def __iter__( # type: ignore [override] # Overriding Pydantic model method """Iterate over the streams in the cache.""" return ((name, dataset) for name, dataset in self.streams.items()) - # ---- Readback introspection helpers ---- - # These private methods support destination readback introspection, - # allowing the cache to query stats about data written by a destination. - - def _readback_quote_identifier(self, identifier: str) -> str: - """Quote an identifier for use in readback SQL queries. - - Defaults to ANSI double-quote style. Subclasses whose SQL dialect - uses a different quoting convention (e.g. BigQuery backticks) should - override this method. - """ - return f'"{identifier}"' - - def _readback_query_row_count( - self, - table_name: str, - ) -> int | None: - """Query the row count for a table. Returns None if the table doesn't exist.""" - try: - quoted_table = self._readback_quote_identifier(table_name) - result = self.run_sql_query( - f"SELECT COUNT(*) AS row_count FROM {self.schema_name}.{quoted_table}", - ) - if result: - row_ci = {k.lower(): v for k, v in result[0].items()} - return int(row_ci.get("row_count", 0)) - return 0 # noqa: TRY300 - except Exception: - logging.getLogger(__name__).debug( - "Table %s.%s not found or not accessible.", - self.schema_name, - table_name, - ) - return None - - def _readback_query_column_info( - self, - table_name: str, - ) -> list[dict[str, str]]: - """Query column names and types for a table. - - Returns a list of dicts with 'column_name' and 'column_type' keys. - """ - try: - engine = self.get_sql_engine() - inspector = sa.inspect(engine) - columns = inspector.get_columns(table_name, schema=self.schema_name) - return [ - { - "column_name": col["name"], - "column_type": str(col["type"]), - } - for col in columns - ] - except Exception: - logging.getLogger(__name__).debug( - "Could not get column info for %s.%s", - self.schema_name, - table_name, - ) - return [] - - def _readback_query_column_stats( - self, - table_name: str, - columns: list[dict[str, str]], - ) -> list[dict[str, Any]]: - """Query per-column null/non-null counts. - - Args: - table_name: The table to query. - columns: List of dicts with at least a 'column_name' key. - - Returns a list of dicts with column_name, null_count, non_null_count, - total_count keys. - """ - if not columns: - return [] - - # Build a SQL query that computes COUNT(*), COUNT(col) for each column. - # COUNT(*) gives total rows, COUNT(col) gives non-null count. - # We use positional aliases (nn_0, nn_1, ...) to avoid issues with - # databases that truncate long identifiers (PostgreSQL: 63 chars). - count_exprs: list[str] = [] - for idx, col in enumerate(columns): - col_name = col["column_name"] - quoted = self._readback_quote_identifier(col_name) - count_exprs.append(f"COUNT({quoted}) AS nn_{idx}") - - count_exprs_str = ", ".join(count_exprs) - sql = ( - f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " - f"FROM {self.schema_name}.{self._readback_quote_identifier(table_name)}" - ) - - try: - result = self.run_sql_query(sql) - except Exception: - logging.getLogger(__name__).debug( - "Could not query column stats for %s.%s", - self.schema_name, - table_name, - ) - return [] - - if not result: - return [] - - row = result[0] - # Case-insensitive key lookup: some DBs uppercase (Snowflake) or - # lowercase (PostgreSQL) unquoted SQL aliases. - row_ci = {k.lower(): v for k, v in row.items()} - total_rows = int(row_ci.get("total_rows", 0)) - - stats = [] - for idx, col in enumerate(columns): - col_name = col["column_name"] - non_null_key = f"nn_{idx}" - val = row_ci.get(non_null_key) - non_null_count = int(val) if val is not None else 0 - stats.append( - { - "column_name": col_name, - "null_count": total_rows - non_null_count, - "non_null_count": non_null_count, - "total_count": total_rows, - } - ) - - return stats - - def _readback_get_table_report( - self, - table_name: str, - stream_name: str, - ) -> dict[str, Any] | None: - """Build a full readback report for a single table. - - Composes ``_readback_query_row_count``, ``_readback_query_column_info``, - and ``_readback_query_column_stats`` into a single dict suitable for - constructing a ``TableReadbackReport``. - - Returns ``None`` when the table does not exist (row count is ``None``). - """ - row_count = self._readback_query_row_count(table_name) - if row_count is None: - return None - - raw_columns = self._readback_query_column_info(table_name) - raw_stats = self._readback_query_column_stats(table_name, raw_columns) - - return { - "table_name": table_name, - "expected_stream_name": stream_name, - "row_count": row_count, - "columns": raw_columns, - "column_stats": raw_stats, - } - - def _readback_get_results( + def get_table_statistics( self, stream_names: list[str], - ) -> dict[str, Any]: - """Run readback introspection for the given streams. + ) -> dict[str, TableStatistics]: + """Return table statistics for the given stream names. - For each stream, resolves the expected table name via the cache's - built-in processor normalizer, then queries row counts, column info, - and column stats. + Delegates to ``self.processor.get_table_statistics()`` which queries + row counts, column info, and per-column null/non-null stats for each + stream. - Returns a dict with keys ``tables``, ``table_reports``, and - ``tables_missing`` ready for constructing a result model. + Returns a dict mapping stream name to a ``TableStatistics`` instance. + Streams whose tables are not found are omitted from the result. """ - tables: list[dict[str, Any]] = [] - table_reports: list[dict[str, Any]] = [] - tables_missing: list[str] = [] - - for stream_name in stream_names: - expected_table = self.processor.get_sql_table_name(stream_name) - report = self._readback_get_table_report(expected_table, stream_name) - - # Fallback: if the normalized name isn't found, try the - # original stream name. Some destinations preserve the - # original casing (e.g. "CamelCaseStreamName") while the - # cache normalizer lowercases it. - if report is None and expected_table != stream_name: - report = self._readback_get_table_report(stream_name, stream_name) - - if report is None: - tables_missing.append(stream_name) - continue - - tables.append( - { - "table_name": report["table_name"], - "row_count": report["row_count"], - "expected_stream_name": stream_name, - } - ) - table_reports.append(report) - - return { - "tables": tables, - "table_reports": table_reports, - "tables_missing": tables_missing, - } + return self.processor.get_table_statistics(stream_names) def _write_airbyte_message_stream( self, diff --git a/airbyte/caches/bigquery.py b/airbyte/caches/bigquery.py index eec898155..a6aaf71e1 100644 --- a/airbyte/caches/bigquery.py +++ b/airbyte/caches/bigquery.py @@ -43,10 +43,6 @@ class BigQueryCache(BigQueryConfig, CacheBase): paired_destination_name: ClassVar[str | None] = "destination-bigquery" paired_destination_config_class: ClassVar[type | None] = DestinationBigquery - def _readback_quote_identifier(self, identifier: str) -> str: - """BigQuery uses backticks instead of ANSI double-quotes.""" - return f"`{identifier}`" - @property def paired_destination_config(self) -> DestinationBigquery: """Return a dictionary of destination configuration values.""" diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index 1663ad23d..e9cd9e726 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -84,6 +84,38 @@ class SQLRuntimeError(Exception): """Raised when an SQL operation fails.""" +class ColumnStatistics(BaseModel): + """Null/non-null statistics for a single column.""" + + column_name: str + """The column name as found in the destination.""" + + column_type: str + """The SQL data type name as reported by the database.""" + + null_count: int + """Number of NULL values in this column.""" + + non_null_count: int + """Number of non-NULL values in this column.""" + + total_count: int + """Total row count (null_count + non_null_count).""" + + +class TableStatistics(BaseModel): + """Statistics for a single table: row count, column info, and per-column stats.""" + + table_name: str + """The table name as found in the destination.""" + + row_count: int + """Number of rows found.""" + + column_statistics: list[ColumnStatistics] + """Per-column names, types, and null/non-null statistics.""" + + class SqlConfig(BaseModel, abc.ABC): """Common configuration for SQL connections.""" @@ -213,12 +245,9 @@ def __init__( ] = defaultdict(list, {}) self._setup() - self.file_writer = ( - file_writer - or self.file_writer_class( # pyrefly: ignore[bad-instantiation] - cache_dir=cast("Path", temp_dir), - cleanup=temp_file_cleanup, - ) + self.file_writer = file_writer or self.file_writer_class( # pyrefly: ignore[bad-instantiation] + cache_dir=cast("Path", temp_dir), + cleanup=temp_file_cleanup, ) self.type_converter = self.type_converter_class() self._cached_table_definitions: dict[str, sqlalchemy.Table] = {} @@ -543,9 +572,9 @@ def _ensure_schema_exists( if DEBUG_MODE: found_schemas = schemas_list - assert ( - schema_name in found_schemas - ), f"Schema {schema_name} was not created. Found: {found_schemas}" + assert schema_name in found_schemas, ( + f"Schema {schema_name} was not created. Found: {found_schemas}" + ) def _quote_identifier(self, identifier: str) -> str: """Return the given identifier, quoted.""" @@ -1012,10 +1041,10 @@ def _append_temp_table_to_final_table( self._execute_sql( f""" INSERT INTO {self._fully_qualified(final_table_name)} ( - {f',{nl} '.join(columns)} + {f",{nl} ".join(columns)} ) SELECT - {f',{nl} '.join(columns)} + {f",{nl} ".join(columns)} FROM {self._fully_qualified(temp_table_name)} """, ) @@ -1040,8 +1069,7 @@ def _swap_temp_table_with_final_table( deletion_name = f"{final_table_name}_deleteme" commands = "\n".join( [ - f"ALTER TABLE {self._fully_qualified(final_table_name)} RENAME " - f"TO {deletion_name};", + f"ALTER TABLE {self._fully_qualified(final_table_name)} RENAME TO {deletion_name};", f"ALTER TABLE {self._fully_qualified(temp_table_name)} RENAME " f"TO {final_table_name};", f"DROP TABLE {self._fully_qualified(deletion_name)};", @@ -1081,10 +1109,10 @@ def _merge_temp_table_to_final_table( {set_clause} WHEN NOT MATCHED THEN INSERT ( - {f',{nl} '.join(columns)} + {f",{nl} ".join(columns)} ) VALUES ( - tmp.{f',{nl} tmp.'.join(columns)} + tmp.{f",{nl} tmp.".join(columns)} ); """, ) @@ -1179,3 +1207,161 @@ def _table_exists( Subclasses may override this method to provide a more efficient implementation. """ return table_name in self._get_tables_list() + + # ---- Table introspection helpers ---- + + def get_row_count( + self, + table_name: str, + ) -> int: + """Return the number of rows in the given table. + + Raises ``SQLRuntimeError`` if the table does not exist or the query + fails for any other reason. + """ + sql = f"SELECT COUNT(*) AS row_count FROM {self._fully_qualified(table_name)}" + result = self._execute_sql(sql) + row = result.mappings().fetchone() + if row is None: + return 0 + # Case-insensitive key lookup: Snowflake uppercases unquoted aliases. + row_ci = {k.lower(): v for k, v in row.items()} + return int(row_ci.get("row_count", 0)) + + def get_column_info( + self, + table_name: str, + ) -> list[dict[str, str]]: + """Return column names and types for the given table. + + Each entry is a dict with ``column_name`` and ``column_type`` keys. + + Raises if the table does not exist or is not accessible. + """ + engine = self.get_sql_engine() + inspector: Inspector = sqlalchemy.inspect(engine) + columns = inspector.get_columns(table_name, schema=self.sql_config.schema_name) + return [ + { + "column_name": col["name"], + "column_type": str(col["type"]), + } + for col in columns + ] + + def get_column_stats( + self, + table_name: str, + columns: list[dict[str, str]], + ) -> list[dict[str, Any]]: + """Return per-column null/non-null counts for the given table. + + *columns* should be a list of dicts with at least a ``column_name`` + key (as returned by :meth:`get_column_info`). + + Returns a list of dicts with ``column_name``, ``null_count``, + ``non_null_count``, and ``total_count`` keys. + + Positional aliases (``nn_0``, ``nn_1``, ...) are used instead of + column-name-derived aliases to avoid issues with databases that + truncate long identifiers (PostgreSQL: 63 chars). + """ + if not columns: + return [] + + count_exprs: list[str] = [] + for idx, col in enumerate(columns): + col_name = col["column_name"] + quoted = self._quote_identifier(col_name) + count_exprs.append(f"COUNT({quoted}) AS nn_{idx}") + + count_exprs_str = ", ".join(count_exprs) + sql = ( + f"SELECT COUNT(*) AS total_rows, {count_exprs_str} " + f"FROM {self._fully_qualified(table_name)}" + ) + + result = self._execute_sql(sql) + row = result.mappings().fetchone() + if row is None: + return [] + + # Case-insensitive key lookup for cross-DB compatibility. + row_ci = {k.lower(): v for k, v in row.items()} + total_rows = int(row_ci.get("total_rows", 0)) + + stats = [] + for idx, col in enumerate(columns): + col_name = col["column_name"] + non_null_key = f"nn_{idx}" + val = row_ci.get(non_null_key) + non_null_count = int(val) if val is not None else 0 + stats.append( + { + "column_name": col_name, + "null_count": total_rows - non_null_count, + "non_null_count": non_null_count, + "total_count": total_rows, + } + ) + + return stats + + def get_table_statistics( + self, + stream_names: list[str], + ) -> dict[str, TableStatistics]: + """Return table statistics for the given stream names. + + For each stream, resolves the expected table name via the processor's + normalizer, queries row counts, column info, and per-column null/non-null + stats. + + If the normalized table name is not found, falls back to the original + stream name (some destinations preserve original casing). + + Returns a dict mapping stream name to a ``TableStatistics`` instance. + Streams whose tables are not found are omitted from the result. + """ + result: dict[str, TableStatistics] = {} + + for stream_name in stream_names: + expected_table = self.get_sql_table_name(stream_name) + + # Try the normalized name first, then fall back to original. + table_name: str | None = None + for candidate in (expected_table, stream_name): + if self._table_exists(candidate): + table_name = candidate + break + + if table_name is None: + continue + + row_count = self.get_row_count(table_name) + columns = self.get_column_info(table_name) + stats = self.get_column_stats(table_name, columns) + + # Merge column info and stats into ColumnStatistics objects. + stats_by_col = {s["column_name"]: s for s in stats} + col_statistics: list[ColumnStatistics] = [] + for col in columns: + col_name = col["column_name"] + col_stat = stats_by_col.get(col_name, {}) + col_statistics.append( + ColumnStatistics( + column_name=col_name, + column_type=col["column_type"], + null_count=col_stat.get("null_count", 0), + non_null_count=col_stat.get("non_null_count", 0), + total_count=col_stat.get("total_count", 0), + ) + ) + + result[stream_name] = TableStatistics( + table_name=table_name, + row_count=row_count, + column_statistics=col_statistics, + ) + + return result From f0b26d775319531cfe8ea3d083ba861957a0e80e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:46:00 +0000 Subject: [PATCH 20/49] fix: restore CI-expected formatting in sql_processor.py Co-Authored-By: AJ Steers --- airbyte/shared/sql_processor.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index e9cd9e726..d68f13d8b 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -245,9 +245,12 @@ def __init__( ] = defaultdict(list, {}) self._setup() - self.file_writer = file_writer or self.file_writer_class( # pyrefly: ignore[bad-instantiation] - cache_dir=cast("Path", temp_dir), - cleanup=temp_file_cleanup, + self.file_writer = ( + file_writer + or self.file_writer_class( # pyrefly: ignore[bad-instantiation] + cache_dir=cast("Path", temp_dir), + cleanup=temp_file_cleanup, + ) ) self.type_converter = self.type_converter_class() self._cached_table_definitions: dict[str, sqlalchemy.Table] = {} @@ -572,9 +575,9 @@ def _ensure_schema_exists( if DEBUG_MODE: found_schemas = schemas_list - assert schema_name in found_schemas, ( - f"Schema {schema_name} was not created. Found: {found_schemas}" - ) + assert ( + schema_name in found_schemas + ), f"Schema {schema_name} was not created. Found: {found_schemas}" def _quote_identifier(self, identifier: str) -> str: """Return the given identifier, quoted.""" From 174bbfbf3d2081666286c84a9fa2327d3e2356b0 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:52:01 +0000 Subject: [PATCH 21/49] style: convert all docstrings from reST to Markdown formatting Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 16 ++++++++-------- airbyte/caches/base.py | 4 ++-- airbyte/destinations/base.py | 6 +++--- airbyte/shared/sql_processor.py | 16 ++++++++-------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 1d66bb981..c35b3511c 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -37,7 +37,7 @@ NAMESPACE_PREFIX = "zz_deleteme" """Prefix for auto-generated smoke test namespaces. -The ``zz_`` prefix sorts last alphabetically; ``deleteme`` signals the +The `zz_` prefix sorts last alphabetically; `deleteme` signals the namespace is safe for automated cleanup. """ @@ -56,11 +56,11 @@ def generate_namespace( ) -> str: """Generate a smoke-test namespace. - Format: ``zz_deleteme_yyyymmdd_hhmm_``. - The ``zz_`` prefix sorts last alphabetically and the ``deleteme`` + Format: `zz_deleteme_yyyymmdd_hhmm_`. + The `zz_` prefix sorts last alphabetically and the `deleteme` token acts as a guard for automated cleanup scripts. - If *namespace_suffix* is not provided, ``smoke_test`` is used as the + If `namespace_suffix` is not provided, `smoke_test` is used as the default suffix. """ suffix = namespace_suffix or DEFAULT_NAMESPACE_SUFFIX @@ -77,7 +77,7 @@ def generate_namespace( class DestinationReadbackResult(BaseModel): """Result of reading back destination-written data. - Uses ``TableStatistics`` from the SQL processor layer to provide + Uses `TableStatistics` from the SQL processor layer to provide per-table row counts, column names/types, and per-column null/non-null counts. """ @@ -319,7 +319,7 @@ def run_destination_smoke_test( introspection is automatically performed after a successful write. The readback produces stats on the written data (table row counts, column names/types, and per-column null/non-null counts) and is - included in the result as ``readback_result``. + included in the result as `readback_result`. `destination` is a resolved `Destination` object ready for writing. @@ -330,8 +330,8 @@ def run_destination_smoke_test( - A comma-separated string or list of specific scenario names. `namespace_suffix` is an optional suffix appended to the auto-generated - namespace. Defaults to ``smoke_test`` when not provided - (e.g. ``zz_deleteme_20260318_2256_smoke_test``). + namespace. Defaults to `smoke_test` when not provided + (e.g. `zz_deleteme_20260318_2256_smoke_test`). `reuse_namespace` is an exact namespace string to reuse from a previous run. When set, no new namespace is generated. diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index e3f5ebbce..58d6c05a8 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -445,11 +445,11 @@ def get_table_statistics( ) -> dict[str, TableStatistics]: """Return table statistics for the given stream names. - Delegates to ``self.processor.get_table_statistics()`` which queries + Delegates to `self.processor.get_table_statistics()` which queries row counts, column info, and per-column null/non-null stats for each stream. - Returns a dict mapping stream name to a ``TableStatistics`` instance. + Returns a dict mapping stream name to a `TableStatistics` instance. Streams whose tables are not found are omitted from the result. """ return self.processor.get_table_statistics(stream_names) diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index d66d91762..0cb3f65a1 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -66,10 +66,10 @@ def __init__( @staticmethod def _normalize_destination_name(name: str) -> str: - """Normalize a destination name to canonical form (``destination-``). + """Normalize a destination name to canonical form (`destination-`). - Accepts either the short form (e.g. ``snowflake``) or the canonical - form (e.g. ``destination-snowflake``). + Accepts either the short form (e.g. `snowflake`) or the canonical + form (e.g. `destination-snowflake`). """ if not name.startswith(_CANONICAL_PREFIX): return f"{_CANONICAL_PREFIX}{name}" diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index d68f13d8b..7408ce5be 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -1219,7 +1219,7 @@ def get_row_count( ) -> int: """Return the number of rows in the given table. - Raises ``SQLRuntimeError`` if the table does not exist or the query + Raises `SQLRuntimeError` if the table does not exist or the query fails for any other reason. """ sql = f"SELECT COUNT(*) AS row_count FROM {self._fully_qualified(table_name)}" @@ -1237,7 +1237,7 @@ def get_column_info( ) -> list[dict[str, str]]: """Return column names and types for the given table. - Each entry is a dict with ``column_name`` and ``column_type`` keys. + Each entry is a dict with `column_name` and `column_type` keys. Raises if the table does not exist or is not accessible. """ @@ -1259,13 +1259,13 @@ def get_column_stats( ) -> list[dict[str, Any]]: """Return per-column null/non-null counts for the given table. - *columns* should be a list of dicts with at least a ``column_name`` - key (as returned by :meth:`get_column_info`). + `columns` should be a list of dicts with at least a `column_name` + key (as returned by `get_column_info()`). - Returns a list of dicts with ``column_name``, ``null_count``, - ``non_null_count``, and ``total_count`` keys. + Returns a list of dicts with `column_name`, `null_count`, + `non_null_count`, and `total_count` keys. - Positional aliases (``nn_0``, ``nn_1``, ...) are used instead of + Positional aliases (`nn_0`, `nn_1`, ...) are used instead of column-name-derived aliases to avoid issues with databases that truncate long identifiers (PostgreSQL: 63 chars). """ @@ -1323,7 +1323,7 @@ def get_table_statistics( If the normalized table name is not found, falls back to the original stream name (some destinations preserve original casing). - Returns a dict mapping stream name to a ``TableStatistics`` instance. + Returns a dict mapping stream name to a `TableStatistics` instance. Streams whose tables are not found are omitted from the result. """ result: dict[str, TableStatistics] = {} From 1e5c45b681127428640ca123bc77e1cdf72c311b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 16:59:55 +0000 Subject: [PATCH 22/49] fix: make stats fields nullable, broaden readback exception handling per review Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 6 ++++++ airbyte/shared/sql_processor.py | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index c35b3511c..98d49f0d4 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -399,6 +399,12 @@ def run_destination_smoke_test( "Readback not supported for destination '%s'.", destination.name, ) + except Exception: + logger.warning( + "Readback failed for destination '%s'.", + destination.name, + exc_info=True, + ) return DestinationSmokeTestResult( success=success, diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index 7408ce5be..a017a83cd 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -93,13 +93,13 @@ class ColumnStatistics(BaseModel): column_type: str """The SQL data type name as reported by the database.""" - null_count: int + null_count: int | None = None """Number of NULL values in this column.""" - non_null_count: int + non_null_count: int | None = None """Number of non-NULL values in this column.""" - total_count: int + total_count: int | None = None """Total row count (null_count + non_null_count).""" @@ -109,7 +109,7 @@ class TableStatistics(BaseModel): table_name: str """The table name as found in the destination.""" - row_count: int + row_count: int | None = None """Number of rows found.""" column_statistics: list[ColumnStatistics] From a1187b4a1448b8d0774a9065de569065e7597873 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:03:25 +0000 Subject: [PATCH 23/49] docs: add Markdown docstring style guideline to CONTRIBUTING.md Co-Authored-By: AJ Steers --- CONTRIBUTING.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b659fdf0..15ae64644 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,6 +2,14 @@ Thank you for your interest in contributing to PyAirbyte! +## Docstring Style + +Use **Markdown** formatting in all docstrings — not reStructuredText (reST). + +- Use single backticks for inline code: `` `MyClass` ``, not double backticks (` ``MyClass`` `). +- Reference methods as `` `get_column_info()` ``, not `:meth:\`get_column_info\``. +- Use standard Markdown for emphasis, lists, and links. + ## 🚀 Releasing This project uses [`semantic-pr-release-drafter`](https://github.com/aaronsteers/semantic-pr-release-drafter) for automated release management. To release, simply click "`Edit`" on the latest release draft from the [releases page](https://github.com/airbytehq/PyAirbyte/releases), and then click "`Publish release`". This publish operation will trigger all necessary downstream publish operations. From 8a5ea93fa10a3edc2bf044f7a33cbb08a83c1cf6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:12:29 +0000 Subject: [PATCH 24/49] fix: pass schema_name through destination_to_cache, add db/schema to TableStatistics, fix docstring Co-Authored-By: AJ Steers --- airbyte/destinations/_translate_dest_to_cache.py | 7 ++++++- airbyte/destinations/base.py | 5 +---- airbyte/mcp/local.py | 2 +- airbyte/shared/sql_processor.py | 8 ++++++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index dd3cefcb0..7ba8a6d64 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -35,6 +35,8 @@ def destination_to_cache( destination_configuration: api_util.DestinationConfiguration | dict[str, Any], + *, + schema_name: str | None = None, ) -> CacheBase: """Get the destination configuration from the cache.""" conversion_fn_map: dict[str, Callable[[Any], CacheBase]] = { @@ -71,7 +73,10 @@ def destination_to_cache( ) conversion_fn = conversion_fn_map[destination_type] - return conversion_fn(destination_configuration) + cache = conversion_fn(destination_configuration) + if schema_name is not None: + cache.schema_name = schema_name + return cache def bigquery_destination_to_cache( diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index 0cb3f65a1..9fceb2cd3 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -128,10 +128,7 @@ def get_sql_cache( dest_type = resolved_name.replace(_CANONICAL_PREFIX, "") config["destinationType"] = dest_type - cache: CacheBase = destination_to_cache(config) - - if schema_name is not None: - cache = cache.model_copy(update={"schema_name": schema_name}) + cache: CacheBase = destination_to_cache(config, schema_name=schema_name) return cache diff --git a/airbyte/mcp/local.py b/airbyte/mcp/local.py index 4ba136f31..5dd0a06d0 100644 --- a/airbyte/mcp/local.py +++ b/airbyte/mcp/local.py @@ -915,7 +915,7 @@ def destination_smoke_test( # noqa: PLR0913, PLR0917 automatically performed after a successful write. The readback produces stats on the written data: table row counts, column names/types, and per-column null/non-null counts. Results are included in the response - as ``readback_result``. + as `readback_result`. """ # Resolve destination config config_dict = resolve_connector_config( diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index a017a83cd..29f74d63c 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -109,6 +109,12 @@ class TableStatistics(BaseModel): table_name: str """The table name as found in the destination.""" + database_name: str | None = None + """The database name where this table resides.""" + + schema_name: str | None = None + """The schema name where this table resides.""" + row_count: int | None = None """Number of rows found.""" @@ -1363,6 +1369,8 @@ def get_table_statistics( result[stream_name] = TableStatistics( table_name=table_name, + database_name=self.database_name, + schema_name=self.sql_config.schema_name, row_count=row_count, column_statistics=col_statistics, ) From 2aea3d0a84d9583889debe8c28246b471793cf41 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:14:08 +0000 Subject: [PATCH 25/49] refactor: improve get_column_info docstring, share inspector across tables Co-Authored-By: AJ Steers --- airbyte/shared/sql_processor.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index 29f74d63c..fe9f9b760 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -1240,15 +1240,28 @@ def get_row_count( def get_column_info( self, table_name: str, + *, + inspector: Inspector | None = None, ) -> list[dict[str, str]]: - """Return column names and types for the given table. + """Return actual column names and types for the given table. + + This method differs from `_get_sql_column_definitions` in that it always + returns actual detected column types from the database. It will never + return previously-cached types or 'expected' types based on the stream + JSON schema. Each entry is a dict with `column_name` and `column_type` keys. + Args: + table_name: The table to inspect. + inspector: An optional pre-created SQLAlchemy `Inspector` to reuse. + When inspecting many tables, passing a shared inspector avoids + creating a new one per call. + Raises if the table does not exist or is not accessible. """ - engine = self.get_sql_engine() - inspector: Inspector = sqlalchemy.inspect(engine) + if inspector is None: + inspector = sqlalchemy.inspect(self.get_sql_engine()) columns = inspector.get_columns(table_name, schema=self.sql_config.schema_name) return [ { @@ -1334,6 +1347,9 @@ def get_table_statistics( """ result: dict[str, TableStatistics] = {} + # Share a single inspector across all tables to avoid repeated creation. + shared_inspector: Inspector = sqlalchemy.inspect(self.get_sql_engine()) + for stream_name in stream_names: expected_table = self.get_sql_table_name(stream_name) @@ -1348,7 +1364,7 @@ def get_table_statistics( continue row_count = self.get_row_count(table_name) - columns = self.get_column_info(table_name) + columns = self.get_column_info(table_name, inspector=shared_inspector) stats = self.get_column_stats(table_name, columns) # Merge column info and stats into ColumnStatistics objects. From b86d561ea07d559f188f1cf6e2e8a2061e5529ad Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:15:20 +0000 Subject: [PATCH 26/49] refactor: make get_column_stats private (_get_column_stats) Co-Authored-By: AJ Steers --- airbyte/shared/sql_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index fe9f9b760..117d66f7b 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -1271,7 +1271,7 @@ def get_column_info( for col in columns ] - def get_column_stats( + def _get_column_stats( self, table_name: str, columns: list[dict[str, str]], @@ -1365,7 +1365,7 @@ def get_table_statistics( row_count = self.get_row_count(table_name) columns = self.get_column_info(table_name, inspector=shared_inspector) - stats = self.get_column_stats(table_name, columns) + stats = self._get_column_stats(table_name, columns) # Merge column info and stats into ColumnStatistics objects. stats_by_col = {s["column_name"]: s for s in stats} From 8673f9888e1cc764ad86d5ad24544576dd68f3ec Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:16:52 +0000 Subject: [PATCH 27/49] refactor: rename get_* to fetch_* for DB-hitting introspection methods Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 2 +- airbyte/caches/base.py | 6 +++--- airbyte/shared/sql_processor.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 98d49f0d4..ff9d62c0a 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -384,7 +384,7 @@ def run_destination_smoke_test( if success: try: cache = destination.get_sql_cache(schema_name=namespace) - table_statistics = cache.get_table_statistics(stream_names) + table_statistics = cache.fetch_table_statistics(stream_names) tables_missing = [name for name in stream_names if name not in table_statistics] readback_result = DestinationReadbackResult( destination=destination.name, diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 58d6c05a8..2683577d4 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -439,20 +439,20 @@ def __iter__( # type: ignore [override] # Overriding Pydantic model method """Iterate over the streams in the cache.""" return ((name, dataset) for name, dataset in self.streams.items()) - def get_table_statistics( + def fetch_table_statistics( self, stream_names: list[str], ) -> dict[str, TableStatistics]: """Return table statistics for the given stream names. - Delegates to `self.processor.get_table_statistics()` which queries + Delegates to `self.processor.fetch_table_statistics()` which queries row counts, column info, and per-column null/non-null stats for each stream. Returns a dict mapping stream name to a `TableStatistics` instance. Streams whose tables are not found are omitted from the result. """ - return self.processor.get_table_statistics(stream_names) + return self.processor.fetch_table_statistics(stream_names) def _write_airbyte_message_stream( self, diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py index 117d66f7b..add3e6592 100644 --- a/airbyte/shared/sql_processor.py +++ b/airbyte/shared/sql_processor.py @@ -1219,7 +1219,7 @@ def _table_exists( # ---- Table introspection helpers ---- - def get_row_count( + def fetch_row_count( self, table_name: str, ) -> int: @@ -1237,7 +1237,7 @@ def get_row_count( row_ci = {k.lower(): v for k, v in row.items()} return int(row_ci.get("row_count", 0)) - def get_column_info( + def fetch_column_info( self, table_name: str, *, @@ -1279,7 +1279,7 @@ def _get_column_stats( """Return per-column null/non-null counts for the given table. `columns` should be a list of dicts with at least a `column_name` - key (as returned by `get_column_info()`). + key (as returned by `fetch_column_info()`). Returns a list of dicts with `column_name`, `null_count`, `non_null_count`, and `total_count` keys. @@ -1329,7 +1329,7 @@ def _get_column_stats( return stats - def get_table_statistics( + def fetch_table_statistics( self, stream_names: list[str], ) -> dict[str, TableStatistics]: @@ -1363,8 +1363,8 @@ def get_table_statistics( if table_name is None: continue - row_count = self.get_row_count(table_name) - columns = self.get_column_info(table_name, inspector=shared_inspector) + row_count = self.fetch_row_count(table_name) + columns = self.fetch_column_info(table_name, inspector=shared_inspector) stats = self._get_column_stats(table_name, columns) # Merge column info and stats into ColumnStatistics objects. From 20acec8df4689414d99c28d72c1c9400d1c2b7fa Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:22:28 +0000 Subject: [PATCH 28/49] refactor: replace _get_stream_names_from_source with Source.get_selected_streams() Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index ff9d62c0a..0ef8341ec 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -295,12 +295,6 @@ def _sanitize_error(ex: Exception) -> str: return f"{type(ex).__name__}: {ex}" -def _get_stream_names_from_source(source_obj: Source) -> list[str]: - """Extract stream names from a configured source.""" - catalog = source_obj.get_configured_catalog() - return [stream.stream.name for stream in catalog.streams] - - def run_destination_smoke_test( *, destination: Destination, @@ -354,7 +348,7 @@ def run_destination_smoke_test( ) # Capture stream names for readback before the write consumes the source - stream_names = _get_stream_names_from_source(source_obj) + stream_names = source_obj.get_selected_streams() # Normalize scenarios to a display string if isinstance(scenarios, list): From dbc48c298a47c6e9ae87f18c13b7fe3b20de54ad Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:27:06 +0000 Subject: [PATCH 29/49] refactor: delete DestinationReadbackResult, flatten table_statistics into DestinationSmokeTestResult Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 109 ++++------------------- airbyte/mcp/local.py | 2 +- 2 files changed, 17 insertions(+), 94 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 0ef8341ec..a1ef3c7af 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -69,81 +69,6 @@ def generate_namespace( return f"{NAMESPACE_PREFIX}_{ts}_{suffix}" -# --------------------------------------------------------------------------- -# Readback result models -# --------------------------------------------------------------------------- - - -class DestinationReadbackResult(BaseModel): - """Result of reading back destination-written data. - - Uses `TableStatistics` from the SQL processor layer to provide - per-table row counts, column names/types, and per-column null/non-null - counts. - """ - - destination: str - """The destination connector name.""" - - namespace: str - """The namespace (schema) that was inspected.""" - - readback_supported: bool - """Whether readback was supported for this destination.""" - - table_statistics: dict[str, TableStatistics] - """Map of stream name to table statistics (row counts, columns, stats).""" - - tables_missing: list[str] - """Stream names for which the expected table was not found.""" - - error: str | None = None - """Error message if readback failed.""" - - def get_tables_summary(self) -> list[dict[str, Any]]: - """Return dataset 1: tables with row counts as plain dicts.""" - return [ - { - "stream_name": stream_name, - "table_name": stats.table_name, - "row_count": stats.row_count, - } - for stream_name, stats in self.table_statistics.items() - ] - - def get_columns_summary(self) -> list[dict[str, Any]]: - """Return dataset 2: columns with types, grouped by table.""" - result = [] - for stream_name, stats in self.table_statistics.items(): - result.extend( - { - "stream_name": stream_name, - "table_name": stats.table_name, - "column_name": col.column_name, - "column_type": col.column_type, - } - for col in stats.column_statistics - ) - return result - - def get_column_stats_summary(self) -> list[dict[str, Any]]: - """Return dataset 3: per-column null/non-null counts.""" - result = [] - for stream_name, stats in self.table_statistics.items(): - result.extend( - { - "stream_name": stream_name, - "table_name": stats.table_name, - "column_name": col.column_name, - "null_count": col.null_count, - "non_null_count": col.non_null_count, - "total_count": col.total_count, - } - for col in stats.column_statistics - ) - return result - - # --------------------------------------------------------------------------- # Smoke test result model # --------------------------------------------------------------------------- @@ -173,15 +98,18 @@ class DestinationSmokeTestResult(BaseModel): error: str | None = None """Error message if the smoke test failed.""" - readback_result: DestinationReadbackResult | None = None - """Readback introspection result, if supported for this destination. + table_statistics: dict[str, TableStatistics] | None = None + """Map of stream name to table statistics (row counts, columns, stats). - Contains three datasets: - 1. tables - table names with row counts - 2. columns - column names and types per table - 3. column_stats - per-column null/non-null counts + Populated when readback introspection succeeds. `None` if the write + failed or the destination does not have a compatible cache. + """ - None if the write itself failed or readback is not supported. + tables_not_found: list[str] | None = None + """Stream names for which the expected table was not found. + + Populated alongside `table_statistics`. `None` when readback was + not performed. """ @@ -313,7 +241,7 @@ def run_destination_smoke_test( introspection is automatically performed after a successful write. The readback produces stats on the written data (table row counts, column names/types, and per-column null/non-null counts) and is - included in the result as `readback_result`. + included in the result as `table_statistics` and `tables_not_found`. `destination` is a resolved `Destination` object ready for writing. @@ -374,19 +302,13 @@ def run_destination_smoke_test( elapsed = time.monotonic() - start_time # Perform readback introspection if the write succeeded - readback_result: DestinationReadbackResult | None = None + table_statistics: dict[str, TableStatistics] | None = None + tables_not_found: list[str] | None = None if success: try: cache = destination.get_sql_cache(schema_name=namespace) table_statistics = cache.fetch_table_statistics(stream_names) - tables_missing = [name for name in stream_names if name not in table_statistics] - readback_result = DestinationReadbackResult( - destination=destination.name, - namespace=namespace, - readback_supported=True, - table_statistics=table_statistics, - tables_missing=tables_missing, - ) + tables_not_found = [name for name in stream_names if name not in table_statistics] except ValueError: # destination_to_cache raises ValueError for unsupported types logger.info( @@ -408,5 +330,6 @@ def run_destination_smoke_test( scenarios_requested=scenarios_str, elapsed_seconds=round(elapsed, 2), error=error_message, - readback_result=readback_result, + table_statistics=table_statistics, + tables_not_found=tables_not_found, ) diff --git a/airbyte/mcp/local.py b/airbyte/mcp/local.py index 5dd0a06d0..06631f55f 100644 --- a/airbyte/mcp/local.py +++ b/airbyte/mcp/local.py @@ -915,7 +915,7 @@ def destination_smoke_test( # noqa: PLR0913, PLR0917 automatically performed after a successful write. The readback produces stats on the written data: table row counts, column names/types, and per-column null/non-null counts. Results are included in the response - as `readback_result`. + as `table_statistics` and `tables_not_found`. """ # Resolve destination config config_dict = resolve_connector_config( From 9e1782218bfbc9000e4929e3da981f9f5eb9a53e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:27:56 +0000 Subject: [PATCH 30/49] refactor: make tables_not_found a dict mapping stream_name to expected table_name Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index a1ef3c7af..efef27081 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -105,11 +105,12 @@ class DestinationSmokeTestResult(BaseModel): failed or the destination does not have a compatible cache. """ - tables_not_found: list[str] | None = None - """Stream names for which the expected table was not found. + tables_not_found: dict[str, str] | None = None + """Stream names whose expected tables were not found in the destination. - Populated alongside `table_statistics`. `None` when readback was - not performed. + Maps stream name to the expected (normalized) table name that was + looked up but not found. Populated alongside `table_statistics`. + `None` when readback was not performed. """ @@ -303,12 +304,16 @@ def run_destination_smoke_test( # Perform readback introspection if the write succeeded table_statistics: dict[str, TableStatistics] | None = None - tables_not_found: list[str] | None = None + tables_not_found: dict[str, str] | None = None if success: try: cache = destination.get_sql_cache(schema_name=namespace) table_statistics = cache.fetch_table_statistics(stream_names) - tables_not_found = [name for name in stream_names if name not in table_statistics] + tables_not_found = { + name: cache.processor.get_sql_table_name(name) + for name in stream_names + if name not in table_statistics + } except ValueError: # destination_to_cache raises ValueError for unsupported types logger.info( From 19045b562513ed9ea51ec1457dabb942942f0096 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:30:41 +0000 Subject: [PATCH 31/49] refactor: add Destination.is_cache_supported, remove try/except from readback Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 34 +++++++------------ .../destinations/_translate_dest_to_cache.py | 14 ++++++++ airbyte/destinations/base.py | 16 +++++++++ 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index efef27081..457403d88 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -305,27 +305,19 @@ def run_destination_smoke_test( # Perform readback introspection if the write succeeded table_statistics: dict[str, TableStatistics] | None = None tables_not_found: dict[str, str] | None = None - if success: - try: - cache = destination.get_sql_cache(schema_name=namespace) - table_statistics = cache.fetch_table_statistics(stream_names) - tables_not_found = { - name: cache.processor.get_sql_table_name(name) - for name in stream_names - if name not in table_statistics - } - except ValueError: - # destination_to_cache raises ValueError for unsupported types - logger.info( - "Readback not supported for destination '%s'.", - destination.name, - ) - except Exception: - logger.warning( - "Readback failed for destination '%s'.", - destination.name, - exc_info=True, - ) + if success and destination.is_cache_supported: + cache = destination.get_sql_cache(schema_name=namespace) + table_statistics = cache.fetch_table_statistics(stream_names) + tables_not_found = { + name: cache.processor.get_sql_table_name(name) + for name in stream_names + if name not in table_statistics + } + elif success: + logger.info( + "Readback not supported for destination '%s'.", + destination.name, + ) return DestinationSmokeTestResult( success=success, diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/destinations/_translate_dest_to_cache.py index 7ba8a6d64..49417675a 100644 --- a/airbyte/destinations/_translate_dest_to_cache.py +++ b/airbyte/destinations/_translate_dest_to_cache.py @@ -33,6 +33,20 @@ SNOWFLAKE_PASSWORD_SECRET_NAME = "SNOWFLAKE_PASSWORD" +_SUPPORTED_DESTINATION_TYPES: set[str] = { + "bigquery", + "duckdb", + "motherduck", + "postgres", + "snowflake", +} + + +def get_supported_destination_types() -> set[str]: + """Return the set of destination type identifiers that have cache support.""" + return _SUPPORTED_DESTINATION_TYPES + + def destination_to_cache( destination_configuration: api_util.DestinationConfiguration | dict[str, Any], *, diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index 9fceb2cd3..13be92a99 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -75,6 +75,22 @@ def _normalize_destination_name(name: str) -> str: return f"{_CANONICAL_PREFIX}{name}" return name + @property + def is_cache_supported(self) -> bool: + """Whether this destination has a compatible cache implementation. + + Returns `True` when `get_sql_cache()` is expected to succeed for + the destination's connector type. + """ + from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 + get_supported_destination_types, + ) + + dest_type = self._normalize_destination_name( + self.name, + ).replace(_CANONICAL_PREFIX, "") + return dest_type in get_supported_destination_types() + def get_sql_cache( self, *, From f4252577113cd4ef822c6dd80588610595da29a4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:33:17 +0000 Subject: [PATCH 32/49] refactor: run readback regardless of write success (partial success support) Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 457403d88..05eb1e5e8 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -305,7 +305,7 @@ def run_destination_smoke_test( # Perform readback introspection if the write succeeded table_statistics: dict[str, TableStatistics] | None = None tables_not_found: dict[str, str] | None = None - if success and destination.is_cache_supported: + if destination.is_cache_supported: cache = destination.get_sql_cache(schema_name=namespace) table_statistics = cache.fetch_table_statistics(stream_names) tables_not_found = { @@ -313,7 +313,7 @@ def run_destination_smoke_test( for name in stream_names if name not in table_statistics } - elif success: + else: logger.info( "Readback not supported for destination '%s'.", destination.name, From 9ec7ac7a60005ca2976d98222800e1d91663b0a7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:37:08 +0000 Subject: [PATCH 33/49] refactor: improve readback skip log message per review suggestion Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 05eb1e5e8..f777d4741 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -315,7 +315,9 @@ def run_destination_smoke_test( } else: logger.info( - "Readback not supported for destination '%s'.", + "Skipping table and column statistics retrieval for " + "destination '%s' because no SQL interface mapping has been " + "defined.", destination.name, ) From 18e1e99bb4593d3cdaf040808fe3fc8614163c46 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:40:24 +0000 Subject: [PATCH 34/49] refactor: simplify get_sql_cache signature, remove destination_name/config/version params Co-Authored-By: AJ Steers --- airbyte/destinations/base.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index 13be92a99..6de304d75 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -95,9 +95,6 @@ def get_sql_cache( self, *, schema_name: str | None = None, - destination_name: str | None = None, - destination_config: dict[str, Any] | None = None, - version: str | None = None, ) -> CacheBase: """Return a SQL Cache for querying data written by this destination. @@ -110,33 +107,16 @@ def get_sql_cache( schema_name: Override the schema/namespace on the returned cache. When `None` the cache uses the default schema from the destination config. - destination_name: The canonical destination connector name - (e.g. `destination-snowflake` or `snowflake`). When - `None`, `self.name` is used. - destination_config: The destination configuration dict. When - `None`, `self.get_config()` is used. - version: Destination version string. Currently only `"latest"` - (or `None`, which is treated as `"latest"`) is accepted. - Any other value raises `NotImplementedError`. Raises: - NotImplementedError: If `version` is not `"latest"` or `None`. ValueError: If the destination type is not supported. """ from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 destination_to_cache, ) - # Version gate - future-proof the signature. - if version is not None and version != "latest": - raise NotImplementedError( - f"Only version='latest' (or None) is currently supported. " f"Got: {version!r}" - ) - - resolved_name = self._normalize_destination_name( - destination_name or self.name, - ) - config = dict(destination_config or self._hydrated_config) + resolved_name = self._normalize_destination_name(self.name) + config = dict(self._hydrated_config) # Ensure the config carries a destinationType key so that # destination_to_cache() can dispatch correctly. @@ -144,9 +124,7 @@ def get_sql_cache( dest_type = resolved_name.replace(_CANONICAL_PREFIX, "") config["destinationType"] = dest_type - cache: CacheBase = destination_to_cache(config, schema_name=schema_name) - - return cache + return destination_to_cache(config, schema_name=schema_name) def write( # noqa: PLR0912, PLR0915 # Too many arguments/statements self, From 97c48c9d57ee2fcc1801d17a9064e7e61d97eef4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:53:16 +0000 Subject: [PATCH 35/49] refactor: move translation files from destinations/ to caches/_utils/ to resolve circular import Co-Authored-By: AJ Steers --- airbyte/caches/_utils/__init__.py | 0 .../_utils/_cache_to_dest.py} | 0 .../_utils/_dest_to_cache.py} | 0 airbyte/caches/bigquery.py | 6 +++--- airbyte/caches/duckdb.py | 2 +- airbyte/caches/motherduck.py | 4 ++-- airbyte/caches/postgres.py | 4 ++-- airbyte/caches/snowflake.py | 4 ++-- airbyte/cloud/sync_results.py | 2 +- airbyte/destinations/base.py | 4 ++-- 10 files changed, 13 insertions(+), 13 deletions(-) create mode 100644 airbyte/caches/_utils/__init__.py rename airbyte/{destinations/_translate_cache_to_dest.py => caches/_utils/_cache_to_dest.py} (100%) rename airbyte/{destinations/_translate_dest_to_cache.py => caches/_utils/_dest_to_cache.py} (100%) diff --git a/airbyte/caches/_utils/__init__.py b/airbyte/caches/_utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/airbyte/destinations/_translate_cache_to_dest.py b/airbyte/caches/_utils/_cache_to_dest.py similarity index 100% rename from airbyte/destinations/_translate_cache_to_dest.py rename to airbyte/caches/_utils/_cache_to_dest.py diff --git a/airbyte/destinations/_translate_dest_to_cache.py b/airbyte/caches/_utils/_dest_to_cache.py similarity index 100% rename from airbyte/destinations/_translate_dest_to_cache.py rename to airbyte/caches/_utils/_dest_to_cache.py diff --git a/airbyte/caches/bigquery.py b/airbyte/caches/bigquery.py index a6aaf71e1..fe5596f9f 100644 --- a/airbyte/caches/bigquery.py +++ b/airbyte/caches/bigquery.py @@ -22,13 +22,13 @@ from airbyte_api.models import DestinationBigquery from airbyte._processors.sql.bigquery import BigQueryConfig, BigQuerySqlProcessor +from airbyte.caches._utils._cache_to_dest import ( + bigquery_cache_to_destination_configuration, +) from airbyte.caches.base import ( CacheBase, ) from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE -from airbyte.destinations._translate_cache_to_dest import ( - bigquery_cache_to_destination_configuration, -) if TYPE_CHECKING: diff --git a/airbyte/caches/duckdb.py b/airbyte/caches/duckdb.py index 3f720e309..c27a661fa 100644 --- a/airbyte/caches/duckdb.py +++ b/airbyte/caches/duckdb.py @@ -23,8 +23,8 @@ from duckdb_engine import DuckDBEngineWarning from airbyte._processors.sql.duckdb import DuckDBConfig, DuckDBSqlProcessor +from airbyte.caches._utils._cache_to_dest import duckdb_cache_to_destination_configuration from airbyte.caches.base import CacheBase -from airbyte.destinations._translate_cache_to_dest import duckdb_cache_to_destination_configuration if TYPE_CHECKING: diff --git a/airbyte/caches/motherduck.py b/airbyte/caches/motherduck.py index 7029724e8..0cb6e70bb 100644 --- a/airbyte/caches/motherduck.py +++ b/airbyte/caches/motherduck.py @@ -26,10 +26,10 @@ from airbyte._processors.sql.duckdb import DuckDBConfig from airbyte._processors.sql.motherduck import MotherDuckSqlProcessor -from airbyte.caches.duckdb import DuckDBCache -from airbyte.destinations._translate_cache_to_dest import ( +from airbyte.caches._utils._cache_to_dest import ( motherduck_cache_to_destination_configuration, ) +from airbyte.caches.duckdb import DuckDBCache from airbyte.secrets import SecretString diff --git a/airbyte/caches/postgres.py b/airbyte/caches/postgres.py index e153309be..678baad8a 100644 --- a/airbyte/caches/postgres.py +++ b/airbyte/caches/postgres.py @@ -24,10 +24,10 @@ from airbyte_api.models import DestinationPostgres from airbyte._processors.sql.postgres import PostgresConfig, PostgresSqlProcessor -from airbyte.caches.base import CacheBase -from airbyte.destinations._translate_cache_to_dest import ( +from airbyte.caches._utils._cache_to_dest import ( postgres_cache_to_destination_configuration, ) +from airbyte.caches.base import CacheBase if TYPE_CHECKING: diff --git a/airbyte/caches/snowflake.py b/airbyte/caches/snowflake.py index 82e59e513..d2d9e1f4a 100644 --- a/airbyte/caches/snowflake.py +++ b/airbyte/caches/snowflake.py @@ -64,10 +64,10 @@ from airbyte_api.models import DestinationSnowflake from airbyte._processors.sql.snowflake import SnowflakeConfig, SnowflakeSqlProcessor -from airbyte.caches.base import CacheBase -from airbyte.destinations._translate_cache_to_dest import ( +from airbyte.caches._utils._cache_to_dest import ( snowflake_cache_to_destination_configuration, ) +from airbyte.caches.base import CacheBase from airbyte.shared.sql_processor import RecordDedupeMode, SqlProcessorBase diff --git a/airbyte/cloud/sync_results.py b/airbyte/cloud/sync_results.py index c59ae2012..8920b9615 100644 --- a/airbyte/cloud/sync_results.py +++ b/airbyte/cloud/sync_results.py @@ -110,9 +110,9 @@ from airbyte_cdk.utils.datetime_helpers import ab_datetime_parse from airbyte._util import api_util +from airbyte.caches._utils._dest_to_cache import destination_to_cache from airbyte.cloud.constants import FAILED_STATUSES, FINAL_STATUSES from airbyte.datasets import CachedDataset -from airbyte.destinations._translate_dest_to_cache import destination_to_cache from airbyte.exceptions import AirbyteConnectionSyncError, AirbyteConnectionSyncTimeoutError diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index 6de304d75..078705326 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -82,7 +82,7 @@ def is_cache_supported(self) -> bool: Returns `True` when `get_sql_cache()` is expected to succeed for the destination's connector type. """ - from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 + from airbyte.caches._utils._dest_to_cache import ( # noqa: PLC0415 get_supported_destination_types, ) @@ -111,7 +111,7 @@ def get_sql_cache( Raises: ValueError: If the destination type is not supported. """ - from airbyte.destinations._translate_dest_to_cache import ( # noqa: PLC0415 + from airbyte.caches._utils._dest_to_cache import ( # noqa: PLC0415 destination_to_cache, ) From d894c8eebe7f4d9fcfea7368c0b821fbe262f174 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:54:31 +0000 Subject: [PATCH 36/49] fix: add copyright notice to caches/_utils/__init__.py Co-Authored-By: AJ Steers --- airbyte/caches/_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte/caches/_utils/__init__.py b/airbyte/caches/_utils/__init__.py index e69de29bb..1022a6f22 100644 --- a/airbyte/caches/_utils/__init__.py +++ b/airbyte/caches/_utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""Cache utility modules for translating between cache and destination configurations.""" From 8576cfc38913867e68427d70fc927cf673b099ce Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 17:58:31 +0000 Subject: [PATCH 37/49] refactor: promote inline imports to top-level now that circular dep is resolved Co-Authored-By: AJ Steers --- airbyte/destinations/base.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/airbyte/destinations/base.py b/airbyte/destinations/base.py index 078705326..0c540ffd7 100644 --- a/airbyte/destinations/base.py +++ b/airbyte/destinations/base.py @@ -14,6 +14,10 @@ from airbyte._message_iterators import AirbyteMessageIterator from airbyte._util.temp_files import as_temp_files from airbyte._writers.base import AirbyteWriterInterface +from airbyte.caches._utils._dest_to_cache import ( + destination_to_cache, + get_supported_destination_types, +) from airbyte.caches.util import get_default_cache from airbyte.progress import ProgressTracker from airbyte.results import ReadResult, WriteResult @@ -82,10 +86,6 @@ def is_cache_supported(self) -> bool: Returns `True` when `get_sql_cache()` is expected to succeed for the destination's connector type. """ - from airbyte.caches._utils._dest_to_cache import ( # noqa: PLC0415 - get_supported_destination_types, - ) - dest_type = self._normalize_destination_name( self.name, ).replace(_CANONICAL_PREFIX, "") @@ -111,10 +111,6 @@ def get_sql_cache( Raises: ValueError: If the destination type is not supported. """ - from airbyte.caches._utils._dest_to_cache import ( # noqa: PLC0415 - destination_to_cache, - ) - resolved_name = self._normalize_destination_name(self.name) config = dict(self._hydrated_config) From 01a59e98a87dd50045eb8f01aefdc34d764b9f92 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:06:15 +0000 Subject: [PATCH 38/49] docs: fix stale docstrings - readback runs regardless of write success Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index f777d4741..7312fd425 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -101,8 +101,9 @@ class DestinationSmokeTestResult(BaseModel): table_statistics: dict[str, TableStatistics] | None = None """Map of stream name to table statistics (row counts, columns, stats). - Populated when readback introspection succeeds. `None` if the write - failed or the destination does not have a compatible cache. + Populated when the destination has a compatible cache, regardless of + write success (to support partial-success inspection). `None` when + the destination does not have a compatible cache. """ tables_not_found: dict[str, str] | None = None @@ -239,7 +240,8 @@ def run_destination_smoke_test( destination and returns a structured result. When the destination has a compatible cache implementation, readback - introspection is automatically performed after a successful write. + introspection is automatically performed (even on write failure, to + support partial-success inspection). The readback produces stats on the written data (table row counts, column names/types, and per-column null/non-null counts) and is included in the result as `table_statistics` and `tables_not_found`. @@ -302,7 +304,7 @@ def run_destination_smoke_test( elapsed = time.monotonic() - start_time - # Perform readback introspection if the write succeeded + # Perform readback introspection (runs even on write failure for partial-success support) table_statistics: dict[str, TableStatistics] | None = None tables_not_found: dict[str, str] | None = None if destination.is_cache_supported: From 18f980fef80062a4e61cfebf02b3b159f4f0aa8b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:12:39 +0000 Subject: [PATCH 39/49] docs: fix remaining double backticks to use Markdown style Co-Authored-By: AJ Steers --- airbyte/caches/_utils/_dest_to_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/caches/_utils/_dest_to_cache.py b/airbyte/caches/_utils/_dest_to_cache.py index 49417675a..c5e440e11 100644 --- a/airbyte/caches/_utils/_dest_to_cache.py +++ b/airbyte/caches/_utils/_dest_to_cache.py @@ -133,8 +133,8 @@ def duckdb_destination_to_cache( db_path = destination_configuration.destination_path # The DuckDB destination Docker container mounts a host directory to - # ``/local`` inside the container. Paths written as ``/local/foo.duckdb`` - # actually live at ``/destination-duckdb/foo.duckdb`` on the + # `/local` inside the container. Paths written as `/local/foo.duckdb` + # actually live at `/destination-duckdb/foo.duckdb` on the # host. Resolve the host-side path so the cache can open the file. if db_path.startswith(("/local/", "/local\\")): from airbyte.constants import DEFAULT_PROJECT_DIR # noqa: PLC0415 From 83236948927c557f8a5008559cde8346f4fd3856 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:26:23 +0000 Subject: [PATCH 40/49] fix: wrap readback introspection in try/except to preserve structured result Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 7312fd425..d1db5421c 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -308,13 +308,20 @@ def run_destination_smoke_test( table_statistics: dict[str, TableStatistics] | None = None tables_not_found: dict[str, str] | None = None if destination.is_cache_supported: - cache = destination.get_sql_cache(schema_name=namespace) - table_statistics = cache.fetch_table_statistics(stream_names) - tables_not_found = { - name: cache.processor.get_sql_table_name(name) - for name in stream_names - if name not in table_statistics - } + try: + cache = destination.get_sql_cache(schema_name=namespace) + table_statistics = cache.fetch_table_statistics(stream_names) + tables_not_found = { + name: cache.processor.get_sql_table_name(name) + for name in stream_names + if name not in table_statistics + } + except Exception: + logger.warning( + "Readback failed for destination '%s'.", + destination.name, + exc_info=True, + ) else: logger.info( "Skipping table and column statistics retrieval for " From a545987f35205bced3cfdaabee9eaef1efd87a8c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:34:47 +0000 Subject: [PATCH 41/49] fix: dispose engine after schema_name override to fix stale schema_translate_map Co-Authored-By: AJ Steers --- airbyte/caches/_utils/_dest_to_cache.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/airbyte/caches/_utils/_dest_to_cache.py b/airbyte/caches/_utils/_dest_to_cache.py index c5e440e11..ec7a15311 100644 --- a/airbyte/caches/_utils/_dest_to_cache.py +++ b/airbyte/caches/_utils/_dest_to_cache.py @@ -90,6 +90,10 @@ def destination_to_cache( cache = conversion_fn(destination_configuration) if schema_name is not None: cache.schema_name = schema_name + # Force engine re-creation so the schema_translate_map picks up + # the overridden schema_name (the engine is lazily cached during + # __init__ with the original schema from the destination config). + cache.processor.sql_config.dispose_engine() return cache From a6902f194f2d9b3e28ad48ddf3b0c2f9ebeac963 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:41:02 +0000 Subject: [PATCH 42/49] fix: fall back to default schema when namespace schema has no tables Some destinations (e.g. Snowflake) ignore the source namespace and write to their configured default schema. The readback now tries the namespace first, then falls back to the destination's default schema if no tables are found. Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index d1db5421c..b2f011c5e 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -309,8 +309,20 @@ def run_destination_smoke_test( tables_not_found: dict[str, str] | None = None if destination.is_cache_supported: try: + # Try the smoke-test namespace first; if no tables are found, + # fall back to the destination's default schema (some destinations + # ignore the source namespace and always write to their configured + # schema, e.g. Snowflake writes to its config `schema` field). cache = destination.get_sql_cache(schema_name=namespace) table_statistics = cache.fetch_table_statistics(stream_names) + if not table_statistics: + logger.info( + "No tables found in namespace '%s'; retrying with " + "destination default schema.", + namespace, + ) + cache = destination.get_sql_cache() + table_statistics = cache.fetch_table_statistics(stream_names) tables_not_found = { name: cache.processor.get_sql_table_name(name) for name in stream_names From 491551e18e570d065c88e0f839b0b631206fb91b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:48:19 +0000 Subject: [PATCH 43/49] fix: override destination config schema key for namespace isolation The smoke test now overrides the destination's schema/dataset config key (e.g. 'schema' for Snowflake/Postgres, 'dataset_id' for BigQuery) to the test namespace before writing. This is the 'belt and suspenders' approach: the catalog namespace is already set on each stream by the source, but some destinations (e.g. Snowflake) prioritize their config schema over the catalog namespace. Also removes the readback fallback to default schema since the config override ensures the destination writes to the correct namespace. Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 64 +++++++++++++++++++----- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index b2f011c5e..6283c54b6 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -44,6 +44,16 @@ DEFAULT_NAMESPACE_SUFFIX = "smoke_test" """Default suffix appended when no explicit suffix is provided.""" +# Map destination types to their schema/dataset config key. +# Used to override the destination config so it writes to the test namespace. +_DESTINATION_SCHEMA_CONFIG_KEYS: dict[str, str] = { + "bigquery": "dataset_id", + "duckdb": "schema", + "motherduck": "schema", + "postgres": "schema", + "snowflake": "schema", +} + if TYPE_CHECKING: from airbyte.destinations.base import Destination @@ -214,6 +224,42 @@ def get_smoke_test_source( ) +def _apply_namespace_to_destination_config( + destination: Destination, + namespace: str, +) -> None: + """Override the schema/dataset in the destination config for namespace isolation. + + Some destinations use their config schema key (e.g. `schema` for Snowflake) + rather than the catalog namespace when deciding where to write data. + This ensures the destination writes to the smoke-test namespace by + overriding the config key directly. + + The catalog namespace is already set on each stream by the source, but + this "belt and suspenders" approach guarantees correct schema targeting + regardless of how a given destination resolves its output schema. + + Note: This mutates the destination's config in place. + """ + dest_name = destination.name + if not dest_name.startswith("destination-"): + dest_name = f"destination-{dest_name}" + dest_type = dest_name.removeprefix("destination-") + + schema_key = _DESTINATION_SCHEMA_CONFIG_KEYS.get(dest_type) + if schema_key: + config = dict(destination.get_config()) + logger.info( + "Overriding destination config '%s' from '%s' to '%s' " + "for namespace isolation.", + schema_key, + config.get(schema_key, ""), + namespace, + ) + config[schema_key] = namespace + destination.set_config(config) + + def _sanitize_error(ex: Exception) -> str: """Extract an error message from an exception without leaking secrets. @@ -287,6 +333,12 @@ def run_destination_smoke_test( else: scenarios_str = scenarios + # Override the destination config schema to write to the test namespace. + # The catalog namespace is already set on each stream by the source, + # but some destinations (e.g. Snowflake) prioritize their config schema + # over the catalog namespace. + _apply_namespace_to_destination_config(destination, namespace) + start_time = time.monotonic() success = False error_message: str | None = None @@ -309,20 +361,8 @@ def run_destination_smoke_test( tables_not_found: dict[str, str] | None = None if destination.is_cache_supported: try: - # Try the smoke-test namespace first; if no tables are found, - # fall back to the destination's default schema (some destinations - # ignore the source namespace and always write to their configured - # schema, e.g. Snowflake writes to its config `schema` field). cache = destination.get_sql_cache(schema_name=namespace) table_statistics = cache.fetch_table_statistics(stream_names) - if not table_statistics: - logger.info( - "No tables found in namespace '%s'; retrying with " - "destination default schema.", - namespace, - ) - cache = destination.get_sql_cache() - table_statistics = cache.fetch_table_statistics(stream_names) tables_not_found = { name: cache.processor.get_sql_table_name(name) for name in stream_names From 8ec0813413858265efc2ef6d46d71c0f786b0ad2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:49:03 +0000 Subject: [PATCH 44/49] style: ruff format destination_smoke_tests.py Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 6283c54b6..30ba23862 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -250,8 +250,7 @@ def _apply_namespace_to_destination_config( if schema_key: config = dict(destination.get_config()) logger.info( - "Overriding destination config '%s' from '%s' to '%s' " - "for namespace isolation.", + "Overriding destination config '%s' from '%s' to '%s' " "for namespace isolation.", schema_key, config.get(schema_key, ""), namespace, From 0eeb90da091721d7d8a20afc39bc74a108007f3d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:53:36 +0000 Subject: [PATCH 45/49] fix: force disable_type_dedupe=false for readback and improve config override The smoke test now also forces disable_type_dedupe=False in the destination config so that final typed tables are created (not just raw staging). Readback introspection requires final tables to query. Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 39 ++++++++++++++++++------ 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 30ba23862..bc01934d8 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -228,16 +228,20 @@ def _apply_namespace_to_destination_config( destination: Destination, namespace: str, ) -> None: - """Override the schema/dataset in the destination config for namespace isolation. + """Prepare the destination config for smoke testing. - Some destinations use their config schema key (e.g. `schema` for Snowflake) - rather than the catalog namespace when deciding where to write data. - This ensures the destination writes to the smoke-test namespace by - overriding the config key directly. + Applies two overrides to the destination config: - The catalog namespace is already set on each stream by the source, but - this "belt and suspenders" approach guarantees correct schema targeting - regardless of how a given destination resolves its output schema. + 1. **Schema/dataset override** — sets the config's schema key + (e.g. `schema` for Snowflake/Postgres, `dataset_id` for BigQuery) + to the test namespace. Some destinations use their config schema + rather than the catalog namespace when deciding where to write. + This "belt and suspenders" approach guarantees correct schema targeting + regardless of how a given destination resolves its output schema. + + 2. **Typing/deduplication enabled** — forces `disable_type_dedupe` + to `False` so the destination creates final typed tables (not just + raw staging). Readback introspection requires final tables. Note: This mutates the destination's config in place. """ @@ -246,16 +250,31 @@ def _apply_namespace_to_destination_config( dest_name = f"destination-{dest_name}" dest_type = dest_name.removeprefix("destination-") + config = dict(destination.get_config()) + changed = False + + # Override the schema/dataset config key to the test namespace. schema_key = _DESTINATION_SCHEMA_CONFIG_KEYS.get(dest_type) if schema_key: - config = dict(destination.get_config()) logger.info( - "Overriding destination config '%s' from '%s' to '%s' " "for namespace isolation.", + "Overriding destination config '%s' from '%s' to '%s' for namespace isolation.", schema_key, config.get(schema_key, ""), namespace, ) config[schema_key] = namespace + changed = True + + # Ensure typing and deduplication are enabled so that final tables + # (not just raw staging) are created for readback inspection. + if config.get("disable_type_dedupe"): + logger.info( + "Forcing 'disable_type_dedupe' to False so final tables are created.", + ) + config["disable_type_dedupe"] = False + changed = True + + if changed: destination.set_config(config) From c79c72e3d3e8871059d7e6e8e740cf7309f2c05e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 19:27:32 +0000 Subject: [PATCH 46/49] =?UTF-8?q?refactor:=20remove=20config=20schema=20ov?= =?UTF-8?q?erride=20=E2=80=94=20catalog=20namespace=20is=20sufficient?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Investigation confirmed that the catalog namespace (set on each stream by the source) is the primary and sufficient mechanism to direct destinations to write into the correct schema. The config-level schema key override was unnecessary and counterproductive for Postgres, where long namespace prefixes on raw table names in airbyte_internal exceed the 63-char identifier limit, causing index name collisions. Changes: - Remove _DESTINATION_SCHEMA_CONFIG_KEYS dict - Remove schema override logic from _apply_namespace_to_destination_config - Rename to _prepare_destination_config (only handles disable_type_dedupe) - Update docstrings and comments to reflect investigation findings Tested: Postgres 14/14, Snowflake 14/14, BigQuery 14/14 all pass with catalog namespace alone directing writes to the test schema. Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 59 ++++++------------------ 1 file changed, 15 insertions(+), 44 deletions(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index bc01934d8..6b68bfaf1 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -44,16 +44,6 @@ DEFAULT_NAMESPACE_SUFFIX = "smoke_test" """Default suffix appended when no explicit suffix is provided.""" -# Map destination types to their schema/dataset config key. -# Used to override the destination config so it writes to the test namespace. -_DESTINATION_SCHEMA_CONFIG_KEYS: dict[str, str] = { - "bigquery": "dataset_id", - "duckdb": "schema", - "motherduck": "schema", - "postgres": "schema", - "snowflake": "schema", -} - if TYPE_CHECKING: from airbyte.destinations.base import Destination @@ -224,47 +214,28 @@ def get_smoke_test_source( ) -def _apply_namespace_to_destination_config( +def _prepare_destination_config( destination: Destination, - namespace: str, ) -> None: """Prepare the destination config for smoke testing. - Applies two overrides to the destination config: + The catalog namespace (set on each stream by the source) is the primary + mechanism that directs destinations to write into the test schema. + Modern destinations respect the catalog namespace without needing a + config-level schema override. - 1. **Schema/dataset override** — sets the config's schema key - (e.g. `schema` for Snowflake/Postgres, `dataset_id` for BigQuery) - to the test namespace. Some destinations use their config schema - rather than the catalog namespace when deciding where to write. - This "belt and suspenders" approach guarantees correct schema targeting - regardless of how a given destination resolves its output schema. + This function applies config-level tweaks that are *not* handled by + the catalog namespace: - 2. **Typing/deduplication enabled** — forces `disable_type_dedupe` - to `False` so the destination creates final typed tables (not just - raw staging). Readback introspection requires final tables. + - **Typing/deduplication enabled** — forces `disable_type_dedupe` + to `False` so the destination creates final typed tables (not just + raw staging). Readback introspection requires final tables. Note: This mutates the destination's config in place. """ - dest_name = destination.name - if not dest_name.startswith("destination-"): - dest_name = f"destination-{dest_name}" - dest_type = dest_name.removeprefix("destination-") - config = dict(destination.get_config()) changed = False - # Override the schema/dataset config key to the test namespace. - schema_key = _DESTINATION_SCHEMA_CONFIG_KEYS.get(dest_type) - if schema_key: - logger.info( - "Overriding destination config '%s' from '%s' to '%s' for namespace isolation.", - schema_key, - config.get(schema_key, ""), - namespace, - ) - config[schema_key] = namespace - changed = True - # Ensure typing and deduplication are enabled so that final tables # (not just raw staging) are created for readback inspection. if config.get("disable_type_dedupe"): @@ -351,11 +322,11 @@ def run_destination_smoke_test( else: scenarios_str = scenarios - # Override the destination config schema to write to the test namespace. - # The catalog namespace is already set on each stream by the source, - # but some destinations (e.g. Snowflake) prioritize their config schema - # over the catalog namespace. - _apply_namespace_to_destination_config(destination, namespace) + # Prepare the destination config for smoke testing (e.g. ensure + # disable_type_dedupe is off so final tables are created for readback). + # The catalog namespace on each stream is the primary mechanism that + # directs the destination to write into the test schema. + _prepare_destination_config(destination) start_time = time.monotonic() success = False From 88f3d9ec08133bab66eadf958bc2864dc24bb5b3 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 19:36:20 +0000 Subject: [PATCH 47/49] fix: skip redundant config validation in _prepare_destination_config The config was already validated when first set on the destination. Re-validating just to toggle disable_type_dedupe adds unnecessary latency (spec fetch + JSON schema check). Pass validate=False. Co-Authored-By: AJ Steers --- airbyte/_util/destination_smoke_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/_util/destination_smoke_tests.py b/airbyte/_util/destination_smoke_tests.py index 6b68bfaf1..8f6e6dffe 100644 --- a/airbyte/_util/destination_smoke_tests.py +++ b/airbyte/_util/destination_smoke_tests.py @@ -246,7 +246,7 @@ def _prepare_destination_config( changed = True if changed: - destination.set_config(config) + destination.set_config(config, validate=False) def _sanitize_error(ex: Exception) -> str: From 006c8912ec20f5535c10ebb25f640be1865d62cf Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 19:46:44 +0000 Subject: [PATCH 48/49] fix: use plaintext password directly in snowflake_destination_to_cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When called from the local Destination.get_sql_cache() path, the password in the hydrated config is already a plaintext value — not a secret name to look up. The previous code passed it to get_secret(), which would fail because no env var matches the literal password string. Now the else branch assigns the password directly instead of routing through get_secret(). Co-Authored-By: AJ Steers --- airbyte/caches/_utils/_dest_to_cache.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte/caches/_utils/_dest_to_cache.py b/airbyte/caches/_utils/_dest_to_cache.py index ec7a15311..16c2954a7 100644 --- a/airbyte/caches/_utils/_dest_to_cache.py +++ b/airbyte/caches/_utils/_dest_to_cache.py @@ -233,7 +233,10 @@ def snowflake_destination_to_cache( "Password is required for Snowflake cache, but it was not available." ) from ex else: - snowflake_password = get_secret(destination_password) + # The password is a plaintext value (e.g. from a local + # Destination's hydrated config). Use it directly instead + # of treating it as a secret name to look up. + snowflake_password = destination_password else: snowflake_password = get_secret(password_secret_name) From e9adb7506a30a250906a5cfa2ed7374d16d87e7e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 23 Mar 2026 19:54:16 +0000 Subject: [PATCH 49/49] fix: use destination's credentials_json for BigQuery cache construction When called from the local Destination.get_sql_cache() path, the destination config contains a plaintext credentials_json field. The previous code always looked up BIGQUERY_CREDENTIALS_PATH, which may point to GSM credentials that lack bigquery.jobs.create permission. Now bigquery_destination_to_cache() extracts credentials_json from the config when available and writes it to a temp file for the cache. Falls back to BIGQUERY_CREDENTIALS_PATH for the cloud API path (obfuscated credentials). Co-Authored-By: AJ Steers --- airbyte/caches/_utils/_dest_to_cache.py | 28 ++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/airbyte/caches/_utils/_dest_to_cache.py b/airbyte/caches/_utils/_dest_to_cache.py index 16c2954a7..a5b4700dc 100644 --- a/airbyte/caches/_utils/_dest_to_cache.py +++ b/airbyte/caches/_utils/_dest_to_cache.py @@ -3,6 +3,9 @@ from __future__ import annotations +import os +import tempfile +from pathlib import Path from typing import TYPE_CHECKING, Any from airbyte_api.models import ( @@ -104,15 +107,38 @@ def bigquery_destination_to_cache( We may have to inject credentials, because they are obfuscated when config is returned from the REST API. + + When the destination config contains a plaintext `credentials_json` field + (the local `Destination.get_sql_cache()` path), the JSON is written to a + temporary file and used directly. Otherwise we fall back to the + `BIGQUERY_CREDENTIALS_PATH` secret/env-var (the cloud API path). """ - credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") + # Extract credentials_json before converting to the Pydantic model, + # because DestinationBigquery may strip or obfuscate the field. + raw_credentials_json: str | None = None if isinstance(destination_configuration, dict): + raw_credentials_json = destination_configuration.get("credentials_json") filtered = { k: v for k, v in destination_configuration.items() if k not in {"destinationType", "DESTINATION_TYPE"} } destination_configuration = DestinationBigquery(**filtered) + elif hasattr(destination_configuration, "credentials_json"): + raw_credentials_json = destination_configuration.credentials_json + + if raw_credentials_json and "****" not in raw_credentials_json: + # Plaintext credentials from a local destination config — write to + # a temp file so BigQueryCache can use it as a credentials path. + # The file is intentionally *not* deleted on close because the + # cache needs the path to remain valid after this function returns. + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".json", prefix="bq_creds_") + Path(tmp_path).write_text(raw_credentials_json, encoding="utf-8") + # Close the file descriptor opened by mkstemp (write_text uses its own). + os.close(tmp_fd) + credentials_path = tmp_path + else: + credentials_path = get_secret("BIGQUERY_CREDENTIALS_PATH") return BigQueryCache( project_name=destination_configuration.project_id,