From a9096fb1fd94fcdcef201f786f64de8b196ef755 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 16 Apr 2026 16:10:22 -0700 Subject: [PATCH 01/12] test remove alias describe query --- .../_internal/analyzer/select_statement.py | 34 +++++++++++++++++++ tests/integ/test_reduce_describe_query.py | 20 +++++++++++ 2 files changed, 54 insertions(+) diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index e59afd4034..af18e10ce2 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -1589,6 +1589,40 @@ def select(self, cols: List[Expression]) -> "SelectStatement": ) ) + if self._session.reduce_describe_query_enabled and self.attributes is not None: + attributes_by_name = {attr.name: attr for attr in self.attributes} + inferred_attributes: List[Attribute] = [] + assert new.projection is not None + for expr in new.projection: + source_column_name = None + projected_column_name = None + if isinstance(expr, (Attribute, UnresolvedAttribute)): + source_column_name = expr.name + projected_column_name = expr.name + elif isinstance(expr, Alias) and isinstance( + expr.child, (Attribute, UnresolvedAttribute) + ): + source_column_name = expr.child.name + projected_column_name = expr.name + else: + inferred_attributes = [] + break + + source_attr = attributes_by_name.get(source_column_name) + if source_attr is None or projected_column_name is None: + inferred_attributes = [] + break + + inferred_attributes.append( + Attribute( + projected_column_name, + source_attr.datatype, + source_attr.nullable, + ) + ) + if len(inferred_attributes) == len(new.projection): + new.attributes = inferred_attributes + new.flatten_disabled = disable_next_level_flatten assert new.projection is not None new._column_states = derive_column_states_from_subquery( diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index 5fe9e033b2..38e1f669c4 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -421,6 +421,26 @@ def test_cache_metadata_on_selectable_entity(session): _ = df.col("a") +def test_project_alias_infers_attributes_from_parent_metadata(session): + df = session.create_dataframe(["v"], schema=["c"]) + _ = df.schema + + parent_attributes = df._plan._metadata.attributes + assert parent_attributes is not None + expected_attributes = [parent_attributes[0].with_name("a2")] + + df2 = df.select(col("c").alias("a2")) + if session.reduce_describe_query_enabled: + check_attributes_equality(df2._plan._metadata.attributes, expected_attributes) + expected_describe_count = 0 + else: + assert df2._plan._metadata.attributes is None + expected_describe_count = 1 + + with SqlCounter(query_count=0, describe_count=expected_describe_count): + check_attributes_equality(df2._plan.attributes, expected_attributes) + + @pytest.mark.skipif(IS_IN_STORED_PROC, reason="Can't create a session in SP") def test_reduce_describe_query_enabled_on_session(db_parameters): with Session.builder.configs(db_parameters).create() as new_session: From 858a60988a5bbcf35e89ea8197b64cc3fdfd5ef7 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 20 Apr 2026 09:49:12 -0700 Subject: [PATCH 02/12] fix test --- tests/integ/test_cte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/test_cte.py b/tests/integ/test_cte.py index 09639160dc..bfa9e4cddc 100644 --- a/tests/integ/test_cte.py +++ b/tests/integ/test_cte.py @@ -259,7 +259,7 @@ def test_binary(session, type, action): def test_join_with_alias_dataframe(session): expected_describe_count = ( - 3 + 2 if (session.reduce_describe_query_enabled and session.sql_simplifier_enabled) else 4 ) From d4a92799618888576f5df65de99b29a7e5c3d338 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 20 Apr 2026 19:33:24 -0700 Subject: [PATCH 03/12] add comment --- .../_internal/analyzer/select_statement.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index af18e10ce2..c175c688ad 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -1589,30 +1589,44 @@ def select(self, cols: List[Expression]) -> "SelectStatement": ) ) + # When describe reduction is on and the inner select already has resolved + # attributes, infer new.attributes for this outer select by reusing datatype and + # nullable from the subquery: (1) index attributes by name, (2) walk + # new.projection, (3) only handle plain columns or Alias(column) — anything + # else aborts without setting partial attributes, (4) map each case to an + # Attribute named for the projected column, (5) assign only if every output + # column was inferred (length matches projection). if self._session.reduce_describe_query_enabled and self.attributes is not None: + # subquery lookup by name attributes_by_name = {attr.name: attr for attr in self.attributes} inferred_attributes: List[Attribute] = [] assert new.projection is not None + # infer from each projected expression for expr in new.projection: source_column_name = None projected_column_name = None if isinstance(expr, (Attribute, UnresolvedAttribute)): + # identity projection: output name equals input column source_column_name = expr.name projected_column_name = expr.name elif isinstance(expr, Alias) and isinstance( expr.child, (Attribute, UnresolvedAttribute) ): + # rename: source column from child, output name from alias source_column_name = expr.child.name projected_column_name = expr.name else: + # non-simple expression: cannot infer types safely inferred_attributes = [] break source_attr = attributes_by_name.get(source_column_name) if source_attr is None or projected_column_name is None: + # missing subquery column for this projection — abort inferred_attributes = [] break + # projected name with subquery type and nullability inferred_attributes.append( Attribute( projected_column_name, @@ -1621,6 +1635,7 @@ def select(self, cols: List[Expression]) -> "SelectStatement": ) ) if len(inferred_attributes) == len(new.projection): + # only commit when every column was inferred new.attributes = inferred_attributes new.flatten_disabled = disable_next_level_flatten From 80cba92a511583f2319ffe7c2d32ceb5f27c6077 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Tue, 21 Apr 2026 15:59:45 -0700 Subject: [PATCH 04/12] add tests --- tests/integ/test_reduce_describe_query.py | 148 +++++++++++++++++++++- 1 file changed, 144 insertions(+), 4 deletions(-) diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index 38e1f669c4..d8b700e78a 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -2,7 +2,7 @@ # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. # -from typing import List +from typing import Dict, List import copy import pytest @@ -10,7 +10,7 @@ from unittest.mock import patch -from snowflake.snowpark import DataFrame +from snowflake.snowpark import DataFrame, Row from snowflake.snowpark._internal.analyzer.expression import Attribute, Star from snowflake.snowpark._internal.analyzer.unary_expression import UnresolvedAlias from snowflake.snowpark._internal.analyzer.unary_plan_node import Project @@ -34,9 +34,9 @@ _PYTHON_SNOWPARK_REDUCE_DESCRIBE_QUERY_ENABLED, Session, ) -from snowflake.snowpark.types import LongType, StructField, StructType +from snowflake.snowpark.types import LongType, StringType, StructField, StructType from tests.integ.utils.sql_counter import SqlCounter -from tests.utils import IS_IN_STORED_PROC, TestData +from tests.utils import IS_IN_STORED_PROC, TestData, Utils pytestmark = [ pytest.mark.skipif( @@ -228,6 +228,10 @@ def check_attributes_equality(attrs1: List[Attribute], attrs2: List[Attribute]) assert attr1.nullable == attr2.nullable +def _attrs_by_name(parent_attributes: List[Attribute]) -> Dict[str, Attribute]: + return {attr.name: attr for attr in parent_attributes} + + def has_star_in_projection(df: DataFrame) -> bool: plan = df._plan.source_plan return isinstance(plan, Project) and any( @@ -441,6 +445,142 @@ def test_project_alias_infers_attributes_from_parent_metadata(session): check_attributes_equality(df2._plan.attributes, expected_attributes) +def test_swap_column_aliases_infers_types_from_source_names(session): + """n-1: columns a, b; n: b AS a, a AS b — metadata follows source column types.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + + parent_attributes = df._plan._metadata.attributes + assert parent_attributes is not None + by_name = _attrs_by_name(parent_attributes) + expected_attributes = [ + by_name['"B"'].with_name("a"), + by_name['"A"'].with_name("b"), + ] + + df2 = df.select(col("b").alias("a"), col("a").alias("b")) + Utils.check_answer(df2, [Row(A=2, B=1)], sort=False) + + if session.reduce_describe_query_enabled: + check_attributes_equality(df2._plan._metadata.attributes, expected_attributes) + expected_describe_count = 0 + else: + assert df2._plan._metadata.attributes is None + expected_describe_count = 1 + + with SqlCounter(query_count=0, describe_count=expected_describe_count): + check_attributes_equality(df2._plan.attributes, expected_attributes) + + +def test_swap_mixed_column_types_inference_follows_source(session): + """Swapped output names must take datatypes from the referenced source column.""" + schema = StructType( + [ + StructField("a", LongType(), nullable=True), + StructField("b", StringType(), nullable=True), + ] + ) + df = session.create_dataframe([(1, "x")], schema=schema) + _ = df.schema + + parent_attributes = df._plan._metadata.attributes + assert parent_attributes is not None + by_name = _attrs_by_name(parent_attributes) + expected_attributes = [ + by_name['"B"'].with_name("a"), + by_name['"A"'].with_name("b"), + ] + + df2 = df.select(col("b").alias("a"), col("a").alias("b")) + Utils.check_answer(df2, [Row(A="x", B=1)], sort=False) + + if session.reduce_describe_query_enabled: + check_attributes_equality(df2._plan._metadata.attributes, expected_attributes) + expected_describe_count = 0 + else: + assert df2._plan._metadata.attributes is None + expected_describe_count = 1 + + with SqlCounter(query_count=0, describe_count=expected_describe_count): + check_attributes_equality(df2._plan.attributes, expected_attributes) + + +def test_column_permutation_inference_name_keyed_lookup(session): + """Non-swap rename: c->a, a->x, b->y — each output type matches its source column.""" + df = session.create_dataframe([[1, 2, 3]], schema=["a", "b", "c"]) + _ = df.schema + + parent_attributes = df._plan._metadata.attributes + assert parent_attributes is not None + by_name = _attrs_by_name(parent_attributes) + expected_attributes = [ + by_name['"C"'].with_name("a"), + by_name['"A"'].with_name("x"), + by_name['"B"'].with_name("y"), + ] + + df2 = df.select(col("c").alias("a"), col("a").alias("x"), col("b").alias("y")) + Utils.check_answer(df2, [Row(A=3, X=1, Y=2)], sort=False) + + if session.reduce_describe_query_enabled: + check_attributes_equality(df2._plan._metadata.attributes, expected_attributes) + expected_describe_count = 0 + else: + assert df2._plan._metadata.attributes is None + expected_describe_count = 1 + + with SqlCounter(query_count=0, describe_count=expected_describe_count): + check_attributes_equality(df2._plan.attributes, expected_attributes) + + +def test_chained_simple_renames_infer_from_previous_metadata(session): + """Second select's parent already has inferred attributes from the first rename.""" + df = session.create_dataframe([[10, 20]], schema=["a", "b"]) + _ = df.schema + + df1 = df.select(col("a").alias("p"), col("b").alias("q")) + if session.reduce_describe_query_enabled: + assert df1._plan._metadata.attributes is not None + mid_attrs = df1._plan._metadata.attributes + assert mid_attrs is not None or not session.reduce_describe_query_enabled + + df2 = df1.select(col("p").alias("x"), col("q").alias("y")) + _ = df1.schema + + if session.reduce_describe_query_enabled: + assert df2._plan._metadata.attributes is not None + by_mid = _attrs_by_name(df1._plan._metadata.attributes or []) + expected_attributes = [ + by_mid['"P"'].with_name("x"), + by_mid['"Q"'].with_name("y"), + ] + check_attributes_equality(df2._plan._metadata.attributes, expected_attributes) + with SqlCounter(query_count=0, describe_count=0): + check_attributes_equality(df2._plan.attributes, expected_attributes) + else: + assert df2._plan._metadata.attributes is None + with SqlCounter(query_count=0, describe_count=1): + _ = df2._plan.attributes + + +def test_non_simple_projection_skips_metadata_inference(session): + """Expressions other than plain column or simple alias(column) do not infer attributes.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + + df2 = df.select((col("a") + lit(1)).alias("ap1"), "b") + + assert df2._plan._metadata.attributes is None + + with SqlCounter(query_count=0, describe_count=1): + _ = df2._plan.attributes + + df3 = df.select(col("a"), (col("b") + lit(1)).alias("b")) + assert df3._plan._metadata.attributes is None + with SqlCounter(query_count=0, describe_count=1): + _ = df3._plan.attributes + + @pytest.mark.skipif(IS_IN_STORED_PROC, reason="Can't create a session in SP") def test_reduce_describe_query_enabled_on_session(db_parameters): with Session.builder.configs(db_parameters).create() as new_session: From fed4876c5aec0051a23a1809e91a3e8be1506a2d Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Wed, 22 Apr 2026 13:30:23 -0700 Subject: [PATCH 05/12] add more test --- tests/integ/test_reduce_describe_query.py | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index d8b700e78a..e034320ae6 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -581,6 +581,60 @@ def test_non_simple_projection_skips_metadata_inference(session): _ = df3._plan.attributes +def test_mixed_simple_column_and_literal_alias_still_requires_describe(session): + """Alias(Literal) is not a simple rename; inference aborts even when the first column is plain.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + + df2 = df.select("a", lit(1).alias("c")) + assert df2._plan._metadata.attributes is None + + with SqlCounter(query_count=0, describe_count=1): + _ = df2._plan.attributes + + +def test_simple_column_then_complex_expression_no_partial_metadata(session): + """First column is inferable but second is not; all-or-nothing — no partial cached attributes.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + + df2 = df.select("a", (col("b") + lit(1)).alias("b2")) + assert df2._plan._metadata.attributes is None + + with SqlCounter(query_count=0, describe_count=1): + _ = df2._plan.attributes + + +def test_cast_on_column_alias_still_requires_describe(session): + """Alias(Cast(...)) is not Alias(Attribute); types cannot be copied from the subquery without DESCRIBE.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + + df2 = df.select(col("a").cast(LongType()).alias("a")) + assert df2._plan._metadata.attributes is None + + with SqlCounter(query_count=0, describe_count=1): + _ = df2._plan.attributes + + +def test_select_star_after_cached_parent(session): + """SELECT * after parent schema is cached: infer_metadata can copy child attributes when reduce_describe is on.""" + df = session.create_dataframe([[1, 2]], schema=["a", "b"]) + _ = df.schema + parent_attrs = df._plan._metadata.attributes + assert parent_attrs is not None + + df2 = df.select("*") + if session.reduce_describe_query_enabled: + assert df2._plan._metadata.attributes is not None + check_attributes_equality(df2._plan._metadata.attributes, parent_attrs) + else: + assert df2._plan._metadata.attributes is None + + # Resolving attributes must match the logical schema (DESCRIBE may run when reduce is off). + check_attributes_equality(df2._plan.attributes, parent_attrs) + + @pytest.mark.skipif(IS_IN_STORED_PROC, reason="Can't create a session in SP") def test_reduce_describe_query_enabled_on_session(db_parameters): with Session.builder.configs(db_parameters).create() as new_session: From 58c7a324d17f7f460c759d25021b23106e721c9c Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Fri, 24 Apr 2026 12:31:33 -0700 Subject: [PATCH 06/12] more defensive code --- .../_internal/analyzer/select_statement.py | 121 +++++++++++------- tests/integ/test_reduce_describe_query.py | 20 +++ 2 files changed, 96 insertions(+), 45 deletions(-) diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index c175c688ad..42927d0560 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -51,8 +51,10 @@ from snowflake.snowpark._internal.analyzer import analyzer_utils from snowflake.snowpark._internal.analyzer.analyzer_utils import ( + quote_name_without_upper_casing, result_scan_statement, schema_value_statement, + unquote_if_quoted, ) from snowflake.snowpark._internal.analyzer.binary_expression import And from snowflake.snowpark._internal.analyzer.expression import ( @@ -85,8 +87,10 @@ has_invalid_projection_merge_functions, ) from snowflake.snowpark._internal.utils import ( - is_sql_select_statement, + ALREADY_QUOTED, ExprAliasUpdateDict, + is_sql_select_statement, + quote_name, ) import snowflake.snowpark.context as context @@ -1591,51 +1595,71 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # When describe reduction is on and the inner select already has resolved # attributes, infer new.attributes for this outer select by reusing datatype and - # nullable from the subquery: (1) index attributes by name, (2) walk - # new.projection, (3) only handle plain columns or Alias(column) — anything - # else aborts without setting partial attributes, (4) map each case to an - # Attribute named for the projected column, (5) assign only if every output - # column was inferred (length matches projection). + # nullable from the subquery: (0) skip if parent column names collide, (1) index + # attributes by normalized name, (2) walk new.projection, (3) only handle plain + # columns or Alias(column), (4) resolve source via quoted-identifier-aware lookup, + # (5) assign only if every output column was inferred (length matches projection). if self._session.reduce_describe_query_enabled and self.attributes is not None: - # subquery lookup by name - attributes_by_name = {attr.name: attr for attr in self.attributes} - inferred_attributes: List[Attribute] = [] - assert new.projection is not None - # infer from each projected expression - for expr in new.projection: - source_column_name = None - projected_column_name = None - if isinstance(expr, (Attribute, UnresolvedAttribute)): - # identity projection: output name equals input column - source_column_name = expr.name - projected_column_name = expr.name - elif isinstance(expr, Alias) and isinstance( - expr.child, (Attribute, UnresolvedAttribute) - ): - # rename: source column from child, output name from alias - source_column_name = expr.child.name - projected_column_name = expr.name - else: - # non-simple expression: cannot infer types safely - inferred_attributes = [] - break - - source_attr = attributes_by_name.get(source_column_name) - if source_attr is None or projected_column_name is None: - # missing subquery column for this projection — abort - inferred_attributes = [] - break - - # projected name with subquery type and nullability - inferred_attributes.append( - Attribute( - projected_column_name, - source_attr.datatype, - source_attr.nullable, - ) - ) - if len(inferred_attributes) == len(new.projection): - # only commit when every column was inferred + parent_attributes = self.attributes + projection = new.projection + inferred_attributes: Optional[List[Attribute]] = None + # Skip: no projection to walk (do not assert; leave new.attributes unchanged). + if projection is not None: + # Skip: duplicate output names on the parent — dict/lookup would be ambiguous. + if len(parent_attributes) == len({a.name for a in parent_attributes}): + attributes_by_normalized: Dict[str, Attribute] = {} + collision = False + for attr in parent_attributes: + key = _normalized_snowflake_identifier_key(attr.name) + existing = attributes_by_normalized.get(key) + # Skip: two parent columns normalize to the same key. + if existing is not None and existing is not attr: + collision = True + break + attributes_by_normalized[key] = attr + if not collision: + inferred_attributes = [] + for expr in projection: + source_column_name: Optional[str] = None + projected_column_name: Optional[str] = None + if isinstance(expr, (Attribute, UnresolvedAttribute)): + source_column_name = expr.name + projected_column_name = expr.name + elif isinstance(expr, Alias) and isinstance( + expr.child, (Attribute, UnresolvedAttribute) + ): + source_column_name = expr.child.name + projected_column_name = expr.name + else: + # Skip: not a plain column or Alias(Attribute|UnresolvedAttribute). + inferred_attributes = [] + break + + if ( + source_column_name is None + or projected_column_name is None + ): + # Skip: missing projected output name. + inferred_attributes = [] + break + source_attr = attributes_by_normalized.get( + _normalized_snowflake_identifier_key(source_column_name) + ) + # Skip: no parent column for this source name. + if source_attr is None: + inferred_attributes = [] + break + inferred_attributes.append( + Attribute( + projected_column_name, + source_attr.datatype, + source_attr.nullable, + ) + ) + if len(inferred_attributes) != len(projection): + # Skip: incomplete inference (includes defensive mismatch). + inferred_attributes = None + if inferred_attributes is not None: new.attributes = inferred_attributes new.flatten_disabled = disable_next_level_flatten @@ -2136,6 +2160,13 @@ class DeriveColumnDependencyError(Exception): """When deriving column dependencies from the subquery.""" +def _normalized_snowflake_identifier_key(name: str) -> str: + """Canonical quoted key: delimited identifiers preserve case; unquoted follow Snowflake uppercasing.""" + if ALREADY_QUOTED.match(name): + return quote_name_without_upper_casing(unquote_if_quoted(name)) + return quote_name(name) + + def parse_column_name( column: Expression, analyzer: "Analyzer", diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index e034320ae6..1774c4b824 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -563,6 +563,26 @@ def test_chained_simple_renames_infer_from_previous_metadata(session): _ = df2._plan.attributes +def test_quoted_case_sensitive_sql_column_metadata_inference(session): + """Delimited identifier from session.sql: chained select infers metadata without DESCRIBE.""" + df = session.sql('SELECT 1 AS "MixedCase"') + with SqlCounter(query_count=0, describe_count=1, strict=False): + _ = df.schema + + df2 = df.select(col('"MixedCase"')) + if session.reduce_describe_query_enabled: + assert df2._plan._metadata.attributes is not None + assert len(df2._plan._metadata.attributes) == 1 + assert df2._plan._metadata.attributes[0].name == '"MixedCase"' + + expected_describe = 0 if session.reduce_describe_query_enabled else 1 + with SqlCounter(query_count=0, describe_count=expected_describe): + attrs = df2._plan.attributes + assert attrs is not None + assert len(attrs) == 1 + assert attrs[0].name == '"MixedCase"' + + def test_non_simple_projection_skips_metadata_inference(session): """Expressions other than plain column or simple alias(column) do not infer attributes.""" df = session.create_dataframe([[1, 2]], schema=["a", "b"]) From 7e6042a39026c83a6400897e1a9821a1827453de Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Fri, 24 Apr 2026 16:26:59 -0700 Subject: [PATCH 07/12] add changelog and remove redundant code --- CHANGELOG.md | 4 + .../_internal/analyzer/select_statement.py | 98 +++++++++---------- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 959915450f..40e64fc4e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ - Added `artifact_repository` support to `udtf_configs` in `session.read.dbapi()`, enabling users to specify a custom artifact repository (e.g. PyPI) for packages used by the internal UDTF during distributed ingestion. +#### Improvements + +- When `Session.reduce_describe_query_enabled` is enabled, fewer DESCRIBE queries are issued when the outer query only projects or renames columns from an inner subquery whose column types are already known. + #### Bug Fixes - Fixed a bug where `TRY_CAST` reader option is ignored when calling `DataFrameReader.schema().csv()`. diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index 42927d0560..6741957312 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -1606,59 +1606,55 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # Skip: no projection to walk (do not assert; leave new.attributes unchanged). if projection is not None: # Skip: duplicate output names on the parent — dict/lookup would be ambiguous. - if len(parent_attributes) == len({a.name for a in parent_attributes}): - attributes_by_normalized: Dict[str, Attribute] = {} - collision = False - for attr in parent_attributes: - key = _normalized_snowflake_identifier_key(attr.name) - existing = attributes_by_normalized.get(key) - # Skip: two parent columns normalize to the same key. - if existing is not None and existing is not attr: - collision = True + attributes_by_normalized: Dict[str, Attribute] = {} + collision = False + for attr in parent_attributes: + key = _normalized_snowflake_identifier_key(attr.name) + existing = attributes_by_normalized.get(key) + # Skip: two parent columns normalize to the same key. + if existing is not None and existing is not attr: + collision = True + break + attributes_by_normalized[key] = attr + if not collision: + inferred_attributes = [] + for expr in projection: + source_column_name: Optional[str] = None + projected_column_name: Optional[str] = None + if isinstance(expr, (Attribute, UnresolvedAttribute)): + source_column_name = expr.name + projected_column_name = expr.name + elif isinstance(expr, Alias) and isinstance( + expr.child, (Attribute, UnresolvedAttribute) + ): + source_column_name = expr.child.name + projected_column_name = expr.name + else: + # Skip: not a plain column or Alias(Attribute|UnresolvedAttribute). + inferred_attributes = [] break - attributes_by_normalized[key] = attr - if not collision: - inferred_attributes = [] - for expr in projection: - source_column_name: Optional[str] = None - projected_column_name: Optional[str] = None - if isinstance(expr, (Attribute, UnresolvedAttribute)): - source_column_name = expr.name - projected_column_name = expr.name - elif isinstance(expr, Alias) and isinstance( - expr.child, (Attribute, UnresolvedAttribute) - ): - source_column_name = expr.child.name - projected_column_name = expr.name - else: - # Skip: not a plain column or Alias(Attribute|UnresolvedAttribute). - inferred_attributes = [] - break - - if ( - source_column_name is None - or projected_column_name is None - ): - # Skip: missing projected output name. - inferred_attributes = [] - break - source_attr = attributes_by_normalized.get( - _normalized_snowflake_identifier_key(source_column_name) - ) - # Skip: no parent column for this source name. - if source_attr is None: - inferred_attributes = [] - break - inferred_attributes.append( - Attribute( - projected_column_name, - source_attr.datatype, - source_attr.nullable, - ) + + if source_column_name is None or projected_column_name is None: + # Skip: missing projected output name. + inferred_attributes = [] + break + source_attr = attributes_by_normalized.get( + _normalized_snowflake_identifier_key(source_column_name) + ) + # Skip: no parent column for this source name. + if source_attr is None: + inferred_attributes = [] + break + inferred_attributes.append( + Attribute( + projected_column_name, + source_attr.datatype, + source_attr.nullable, ) - if len(inferred_attributes) != len(projection): - # Skip: incomplete inference (includes defensive mismatch). - inferred_attributes = None + ) + if len(inferred_attributes) != len(projection): + # Skip: incomplete inference (includes defensive mismatch). + inferred_attributes = None if inferred_attributes is not None: new.attributes = inferred_attributes From c3d8746cd743c9995875a69bbb0feaf2abcd0356 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 27 Apr 2026 10:37:07 -0700 Subject: [PATCH 08/12] increase coverage --- tests/integ/test_reduce_describe_query.py | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index 1774c4b824..6295516cfd 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -637,6 +637,32 @@ def test_cast_on_column_alias_still_requires_describe(session): _ = df2._plan.attributes +def test_select_inference_skips_on_duplicate_parent_keys_and_missing_alias_name( + session, +): + """SelectStatement.select: (1) duplicate parent output aliases — collision skips inference + on a follow-up select; DESCRIBE is not skipped when resolving schema for the duplicate-alias + frame. (2) Alias with missing output name — defensive inference abort.""" + df = session.create_dataframe([[1, 2, 3]], schema=["a", "b", "c"]) + _ = df.schema + dup = df.select((col("a") + 1).as_("b"), (col("c") + 1).as_("b")) + with SqlCounter(query_count=0, describe_count=1): + _ = dup.schema + + dup_outer = dup.select(lit(1).alias("x")) + _ = dup_outer._plan.attributes + + # Scenario B: hit missing-projected-name guard without DataFrame.resolve (which would + # quote_name(None) on the Alias). Call SelectStatement.select directly. + df2 = session.create_dataframe([[1]], schema=["a"]) + _ = df2.schema + bad = col("a").alias("out") + object.__setattr__(bad._expression, "name", None) + inner = df2._select_statement + new_ss = inner.select([bad._named()]) + assert new_ss.attributes is None + + def test_select_star_after_cached_parent(session): """SELECT * after parent schema is cached: infer_metadata can copy child attributes when reduce_describe is on.""" df = session.create_dataframe([[1, 2]], schema=["a", "b"]) From 01c423edc0dafd9e49e4b56f88c002edc187db65 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 27 Apr 2026 15:55:32 -0700 Subject: [PATCH 09/12] remove redundant code --- .../_internal/analyzer/select_statement.py | 23 ++++++------------- tests/integ/test_reduce_describe_query.py | 17 ++++++++++++++ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index 6741957312..cef131aa8c 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -51,10 +51,8 @@ from snowflake.snowpark._internal.analyzer import analyzer_utils from snowflake.snowpark._internal.analyzer.analyzer_utils import ( - quote_name_without_upper_casing, result_scan_statement, schema_value_statement, - unquote_if_quoted, ) from snowflake.snowpark._internal.analyzer.binary_expression import And from snowflake.snowpark._internal.analyzer.expression import ( @@ -87,7 +85,6 @@ has_invalid_projection_merge_functions, ) from snowflake.snowpark._internal.utils import ( - ALREADY_QUOTED, ExprAliasUpdateDict, is_sql_select_statement, quote_name, @@ -1596,9 +1593,10 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # When describe reduction is on and the inner select already has resolved # attributes, infer new.attributes for this outer select by reusing datatype and # nullable from the subquery: (0) skip if parent column names collide, (1) index - # attributes by normalized name, (2) walk new.projection, (3) only handle plain - # columns or Alias(column), (4) resolve source via quoted-identifier-aware lookup, - # (5) assign only if every output column was inferred (length matches projection). + # attributes by quote_name (Snowflake identifier rules; invalid delimited forms + # raise), (2) walk new.projection, (3) only handle plain columns or Alias(column), + # (4) resolve source via the same quote_name key lookup, (5) assign only if every + # output column was inferred (length matches projection). if self._session.reduce_describe_query_enabled and self.attributes is not None: parent_attributes = self.attributes projection = new.projection @@ -1609,9 +1607,9 @@ def select(self, cols: List[Expression]) -> "SelectStatement": attributes_by_normalized: Dict[str, Attribute] = {} collision = False for attr in parent_attributes: - key = _normalized_snowflake_identifier_key(attr.name) + key = quote_name(attr.name) existing = attributes_by_normalized.get(key) - # Skip: two parent columns normalize to the same key. + # Skip: two parent columns map to the same quote_name key. if existing is not None and existing is not attr: collision = True break @@ -1639,7 +1637,7 @@ def select(self, cols: List[Expression]) -> "SelectStatement": inferred_attributes = [] break source_attr = attributes_by_normalized.get( - _normalized_snowflake_identifier_key(source_column_name) + quote_name(source_column_name) ) # Skip: no parent column for this source name. if source_attr is None: @@ -2156,13 +2154,6 @@ class DeriveColumnDependencyError(Exception): """When deriving column dependencies from the subquery.""" -def _normalized_snowflake_identifier_key(name: str) -> str: - """Canonical quoted key: delimited identifiers preserve case; unquoted follow Snowflake uppercasing.""" - if ALREADY_QUOTED.match(name): - return quote_name_without_upper_casing(unquote_if_quoted(name)) - return quote_name(name) - - def parse_column_name( column: Expression, analyzer: "Analyzer", diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index 6295516cfd..a9671f5749 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -17,8 +17,10 @@ from snowflake.snowpark._internal.analyzer.schema_utils import analyze_attributes from snowflake.snowpark._internal.utils import ( TempObjectType, + quote_name, random_name_for_temp_object, ) +from snowflake.snowpark.exceptions import SnowparkPlanException from snowflake.snowpark.functions import ( avg, col, @@ -663,6 +665,21 @@ def test_select_inference_skips_on_duplicate_parent_keys_and_missing_alias_name( assert new_ss.attributes is None +def test_quote_name_malformed_delimited_identifier_not_accepted(): + """Keys for reduce-describe attribute inference use quote_name; malformed delimited names raise (SNOW-3384967).""" + for bad in ('"ab"c"', '""col"', '"col""'): + with pytest.raises(SnowparkPlanException) as ex_info: + quote_name(bad) + assert ex_info.value.error_code == "1200" + assert "Invalid identifier" in str(ex_info.value) + + +def test_quote_name_valid_keys_for_reduce_describe_inference(): + """quote_name keys used for inference: case-sensitive quoted id; unquoted uppercased.""" + assert quote_name('"MixedCase"') == '"MixedCase"' + assert quote_name("a") == '"A"' + + def test_select_star_after_cached_parent(session): """SELECT * after parent schema is cached: infer_metadata can copy child attributes when reduce_describe is on.""" df = session.create_dataframe([[1, 2]], schema=["a", "b"]) From ccd823d59070883f20333b937e396f5875ea4d9d Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Apr 2026 13:50:01 -0700 Subject: [PATCH 10/12] remove quote name --- .../_internal/analyzer/select_statement.py | 23 ++++---- tests/integ/test_reduce_describe_query.py | 56 +++++++++++++++---- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py index cef131aa8c..40d953150c 100644 --- a/src/snowflake/snowpark/_internal/analyzer/select_statement.py +++ b/src/snowflake/snowpark/_internal/analyzer/select_statement.py @@ -87,7 +87,6 @@ from snowflake.snowpark._internal.utils import ( ExprAliasUpdateDict, is_sql_select_statement, - quote_name, ) import snowflake.snowpark.context as context @@ -1593,10 +1592,10 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # When describe reduction is on and the inner select already has resolved # attributes, infer new.attributes for this outer select by reusing datatype and # nullable from the subquery: (0) skip if parent column names collide, (1) index - # attributes by quote_name (Snowflake identifier rules; invalid delimited forms - # raise), (2) walk new.projection, (3) only handle plain columns or Alias(column), - # (4) resolve source via the same quote_name key lookup, (5) assign only if every - # output column was inferred (length matches projection). + # attributes by exact parent Attribute.name, (2) walk new.projection, (3) only + # handle plain columns or Alias(column), (4) resolve source by exact string match + # of the projection source name to that name (no quote_name / normalization), + # (5) assign only if every output column was inferred (length matches projection). if self._session.reduce_describe_query_enabled and self.attributes is not None: parent_attributes = self.attributes projection = new.projection @@ -1604,16 +1603,16 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # Skip: no projection to walk (do not assert; leave new.attributes unchanged). if projection is not None: # Skip: duplicate output names on the parent — dict/lookup would be ambiguous. - attributes_by_normalized: Dict[str, Attribute] = {} + attributes_by_column_name: Dict[str, Attribute] = {} collision = False for attr in parent_attributes: - key = quote_name(attr.name) - existing = attributes_by_normalized.get(key) - # Skip: two parent columns map to the same quote_name key. + key = attr.name + existing = attributes_by_column_name.get(key) + # Skip: two parent columns share the same name string. if existing is not None and existing is not attr: collision = True break - attributes_by_normalized[key] = attr + attributes_by_column_name[key] = attr if not collision: inferred_attributes = [] for expr in projection: @@ -1636,9 +1635,7 @@ def select(self, cols: List[Expression]) -> "SelectStatement": # Skip: missing projected output name. inferred_attributes = [] break - source_attr = attributes_by_normalized.get( - quote_name(source_column_name) - ) + source_attr = attributes_by_column_name.get(source_column_name) # Skip: no parent column for this source name. if source_attr is None: inferred_attributes = [] diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index a9671f5749..fdc2f5207a 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -17,7 +17,6 @@ from snowflake.snowpark._internal.analyzer.schema_utils import analyze_attributes from snowflake.snowpark._internal.utils import ( TempObjectType, - quote_name, random_name_for_temp_object, ) from snowflake.snowpark.exceptions import SnowparkPlanException @@ -665,21 +664,56 @@ def test_select_inference_skips_on_duplicate_parent_keys_and_missing_alias_name( assert new_ss.attributes is None -def test_quote_name_malformed_delimited_identifier_not_accepted(): - """Keys for reduce-describe attribute inference use quote_name; malformed delimited names raise (SNOW-3384967).""" - for bad in ('"ab"c"', '""col"', '"col""'): +def test_reduce_describe_inference_exact_column_name_matrix(session): + """SNOW-3384967: attribute inference keys match Attribute.name strings exactly (no quote_name). + + Exact spelling matches reuse metadata without DESCRIBE when reduce_describe is on; + mismatched strings skip client-side inference (DESCRIBE when resolving attributes). + """ + # Delimited identifier from SQL: exact quoted reference infers. + df_mc = session.sql('SELECT 1 AS "MixedCase"') + _ = df_mc.schema + df_mc_ok = df_mc.select(col('"MixedCase"')) + if session.reduce_describe_query_enabled: + assert df_mc_ok._plan._metadata.attributes is not None + assert len(df_mc_ok._plan._metadata.attributes) == 1 + with SqlCounter( + query_count=0, + describe_count=0 if session.reduce_describe_query_enabled else 1, + ): + _ = df_mc_ok._plan.attributes + + # Same subquery, different delimited spelling: not the same string as parent name. + df_mc_miss = df_mc.select(col('"MIXEDCASE"')) + if session.reduce_describe_query_enabled: + assert df_mc_miss._plan._metadata.attributes is None + with SqlCounter(query_count=0, describe_count=1): + _ = df_mc_miss._plan.attributes + + # create_dataframe lowercase schema: projection uses same logical column via col("a"). + df_vals = session.create_dataframe([[1]], schema=["a"]) + _ = df_vals.schema + df_vals_ok = df_vals.select(col("a")) + if session.reduce_describe_query_enabled: + assert df_vals_ok._plan._metadata.attributes is not None + with SqlCounter( + query_count=0, + describe_count=0 if session.reduce_describe_query_enabled else 1, + ): + _ = df_vals_ok._plan.attributes + + +def test_reduce_describe_inference_invalid_delimited_identifier_rejected(session): + """Malformed delimited identifiers are rejected by plan analysis (error 1200), not coerced.""" + df = session.create_dataframe([[1]], schema=["x"]) + _ = df.schema + for bad_col in (r'"col""', r'"ab"c"', r'""col"'): with pytest.raises(SnowparkPlanException) as ex_info: - quote_name(bad) + df.select(col(bad_col)).collect() assert ex_info.value.error_code == "1200" assert "Invalid identifier" in str(ex_info.value) -def test_quote_name_valid_keys_for_reduce_describe_inference(): - """quote_name keys used for inference: case-sensitive quoted id; unquoted uppercased.""" - assert quote_name('"MixedCase"') == '"MixedCase"' - assert quote_name("a") == '"A"' - - def test_select_star_after_cached_parent(session): """SELECT * after parent schema is cached: infer_metadata can copy child attributes when reduce_describe is on.""" df = session.create_dataframe([[1, 2]], schema=["a", "b"]) From 86d6ff4e7c99e6078468fc11ca43a460d4e4a5b2 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Apr 2026 13:54:10 -0700 Subject: [PATCH 11/12] update changelog --- CHANGELOG.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e64fc4e4..a77904636f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Release History +## 1.51.0 (TBD) + +### Snowpark Python API Updates + +#### Improvements + +- When `Session.reduce_describe_query_enabled` is enabled, fewer DESCRIBE queries are issued when the outer query only projects or renames columns from an inner subquery whose column types are already known. + ## 1.50.0 (2026-04-23) ### Snowpark Python API Updates @@ -8,10 +16,6 @@ - Added `artifact_repository` support to `udtf_configs` in `session.read.dbapi()`, enabling users to specify a custom artifact repository (e.g. PyPI) for packages used by the internal UDTF during distributed ingestion. -#### Improvements - -- When `Session.reduce_describe_query_enabled` is enabled, fewer DESCRIBE queries are issued when the outer query only projects or renames columns from an inner subquery whose column types are already known. - #### Bug Fixes - Fixed a bug where `TRY_CAST` reader option is ignored when calling `DataFrameReader.schema().csv()`. From aafb709672978e1c1decb8ba767b840092757895 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Apr 2026 22:15:35 -0700 Subject: [PATCH 12/12] remove redundant test --- tests/integ/test_reduce_describe_query.py | 39 ----------------------- 1 file changed, 39 deletions(-) diff --git a/tests/integ/test_reduce_describe_query.py b/tests/integ/test_reduce_describe_query.py index fdc2f5207a..42842b3212 100644 --- a/tests/integ/test_reduce_describe_query.py +++ b/tests/integ/test_reduce_describe_query.py @@ -664,45 +664,6 @@ def test_select_inference_skips_on_duplicate_parent_keys_and_missing_alias_name( assert new_ss.attributes is None -def test_reduce_describe_inference_exact_column_name_matrix(session): - """SNOW-3384967: attribute inference keys match Attribute.name strings exactly (no quote_name). - - Exact spelling matches reuse metadata without DESCRIBE when reduce_describe is on; - mismatched strings skip client-side inference (DESCRIBE when resolving attributes). - """ - # Delimited identifier from SQL: exact quoted reference infers. - df_mc = session.sql('SELECT 1 AS "MixedCase"') - _ = df_mc.schema - df_mc_ok = df_mc.select(col('"MixedCase"')) - if session.reduce_describe_query_enabled: - assert df_mc_ok._plan._metadata.attributes is not None - assert len(df_mc_ok._plan._metadata.attributes) == 1 - with SqlCounter( - query_count=0, - describe_count=0 if session.reduce_describe_query_enabled else 1, - ): - _ = df_mc_ok._plan.attributes - - # Same subquery, different delimited spelling: not the same string as parent name. - df_mc_miss = df_mc.select(col('"MIXEDCASE"')) - if session.reduce_describe_query_enabled: - assert df_mc_miss._plan._metadata.attributes is None - with SqlCounter(query_count=0, describe_count=1): - _ = df_mc_miss._plan.attributes - - # create_dataframe lowercase schema: projection uses same logical column via col("a"). - df_vals = session.create_dataframe([[1]], schema=["a"]) - _ = df_vals.schema - df_vals_ok = df_vals.select(col("a")) - if session.reduce_describe_query_enabled: - assert df_vals_ok._plan._metadata.attributes is not None - with SqlCounter( - query_count=0, - describe_count=0 if session.reduce_describe_query_enabled else 1, - ): - _ = df_vals_ok._plan.attributes - - def test_reduce_describe_inference_invalid_delimited_identifier_rejected(session): """Malformed delimited identifiers are rejected by plan analysis (error 1200), not coerced.""" df = session.create_dataframe([[1]], schema=["x"])