From 0d793f9efd4eb9ae06f052e88374d72d7a29d24a Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 18 Dec 2025 23:57:53 +0000 Subject: [PATCH 1/9] feat: Implement AI.GENERATE_EMBEDDING wrapper This change implements the `bigframes.bigquery.ai.generate_embedding` function, which wraps the BigQuery `AI.GENERATE_EMBEDDING` TVF. It supports: - Generating embeddings from DataFrames and Series. - Generating embeddings from pandas DataFrames and Series. - Specifying model name and arguments like `output_dimensionality`, `start_second`, `end_second`, and `interval_seconds`. The function is exposed in `bigframes.bigquery.ai`. Unit tests have been added to verify the generated SQL and argument mapping. --- bigframes/bigquery/_operations/ai.py | 87 ++++++++++++++++- tests/unit/bigquery/test_ai.py | 135 +++++++++++++++++++++++++++ 2 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 tests/unit/bigquery/test_ai.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index e8c28e61f5e..a2ae3044945 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -19,7 +19,7 @@ from __future__ import annotations import json -from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union +from typing import Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union import pandas as pd @@ -387,6 +387,91 @@ def generate_double( return series_list[0]._apply_nary_op(operator, series_list[1:]) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_embedding( + model_name: str, + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + *, + output_dimensionality: Optional[int] = None, + start_second: Optional[float] = None, + end_second: Optional[float] = None, + interval_seconds: Optional[float] = None, +) -> dataframe.DataFrame: + """ + Creates embeddings that describe an entity—for example, a piece of text or an image. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]}) + >>> bbq.ai.generate_embedding( + ... "project.dataset.model_name", + ... df + ... ) # doctest: +SKIP + + Args: + model_name (str): + The name of a remote model over a Vertex AI multimodalembedding@001 model. + data (DataFrame or Series): + The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column. + If a DataFrame is provided, it must contain a 'content' column, or you must rename the column you wish to embed to 'content'. + output_dimensionality (int, optional): + The number of dimensions to use when generating embeddings. Valid values are 128, 256, 512, and 1408. The default value is 1408. + start_second (float, optional): + The second in the video at which to start the embedding. The default value is 0. + end_second (float, optional): + The second in the video at which to end the embedding. The default value is 120. + interval_seconds (float, optional): + The interval to use when creating embeddings. The default value is 16. + + Returns: + bigframes.dataframe.DataFrame: + A new DataFrame with the generated embeddings. It contains the input table columns and the following columns: + * "embedding": an ARRAY value that contains the generated embedding vector. + * "status": a STRING value that contains the API response status for the corresponding row. + * "video_start_sec": for video content, an INT64 value that contains the starting second. + * "video_end_sec": for video content, an INT64 value that contains the ending second. + """ + if isinstance(data, (pd.DataFrame, pd.Series)): + data = bpd.read_pandas(data) + + if isinstance(data, series.Series): + # Rename series to 'content' and convert to DataFrame + data_df = data.rename("content").to_frame() + elif isinstance(data, dataframe.DataFrame): + data_df = data + else: + raise ValueError(f"Unsupported data type: {type(data)}") + + # We need to get the SQL for the input data to pass as a subquery to the TVF + source_sql = data_df.sql + + struct_fields = [] + if output_dimensionality is not None: + struct_fields.append(f"{output_dimensionality} AS output_dimensionality") + if start_second is not None: + struct_fields.append(f"{start_second} AS start_second") + if end_second is not None: + struct_fields.append(f"{end_second} AS end_second") + if interval_seconds is not None: + struct_fields.append(f"{interval_seconds} AS interval_seconds") + + struct_args = ", ".join(struct_fields) + + # Construct the TVF query + query = f""" + SELECT * + FROM AI.GENERATE_EMBEDDING( + MODEL `{model_name}`, + ({source_sql}), + STRUCT({struct_args}) + ) + """ + + return data_df._session.read_gbq(query) + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py new file mode 100644 index 00000000000..c9c046664f7 --- /dev/null +++ b/tests/unit/bigquery/test_ai.py @@ -0,0 +1,135 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pandas as pd +import pytest + +import bigframes.bigquery._operations.ai as ai_ops +import bigframes.dataframe +import bigframes.series +import bigframes.session + + +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + +@pytest.fixture +def mock_dataframe(mock_session): + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT * FROM my_table" + return df + + +@pytest.fixture +def mock_series(mock_session): + s = mock.create_autospec(spec=bigframes.series.Series) + s._session = mock_session + # Mock to_frame to return a mock dataframe + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT my_col AS content FROM my_table" + s.rename.return_value.to_frame.return_value = df + return s + + +def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + ai_ops.generate_embedding( + model_name, + mock_dataframe, + output_dimensionality=256, + ) + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT(256 AS output_dimensionality)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_embedding_with_series(mock_series, mock_session): + model_name = "project.dataset.model" + + ai_ops.generate_embedding( + model_name, + mock_series, + start_second=0.0, + end_second=10.0, + interval_seconds=5.0 + ) + + mock_series.rename.assert_called_with("content") + mock_series.rename.return_value.to_frame.assert_called_once() + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT my_col AS content FROM my_table)" in query + assert "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)" in query + + +def test_generate_embedding_defaults(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + ai_ops.generate_embedding( + model_name, + mock_dataframe, + ) + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "STRUCT()" in query + + +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_embedding_with_pandas_dataframe(read_pandas_mock, mock_dataframe, mock_session): + # This tests that pandas input path works and calls read_pandas + model_name = "project.dataset.model" + + # Mock return value of read_pandas to be a BigFrames DataFrame + read_pandas_mock.return_value = mock_dataframe + + pandas_df = pd.DataFrame({"content": ["test"]}) + + ai_ops.generate_embedding( + model_name, + pandas_df, + ) + + read_pandas_mock.assert_called_once() + # Check that read_pandas was called with something (the pandas df) + assert read_pandas_mock.call_args[0][0] is pandas_df + + mock_session.read_gbq.assert_called_once() From 9a774ac2cd38aa4f5e71f0bad1c8a0b01528a806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 16 Jan 2026 21:02:28 +0000 Subject: [PATCH 2/9] update some unit tests --- bigframes/bigquery/_operations/ai.py | 62 ++++++++---- bigframes/core/pyformat.py | 3 +- bigframes/core/sql/__init__.py | 74 +------------- bigframes/core/sql/literals.py | 99 +++++++++++++++++++ bigframes/core/sql/ml.py | 7 +- tests/unit/bigquery/test_ai.py | 15 +-- .../evaluate_model_with_options.sql | 2 +- .../explain_predict_model_with_options.sql | 2 +- .../global_explain_model_with_options.sql | 2 +- .../predict_model_with_options.sql | 2 +- 10 files changed, 158 insertions(+), 110 deletions(-) create mode 100644 bigframes/core/sql/literals.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index a3cd6deac26..4811ab8e190 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -28,6 +28,7 @@ from bigframes import series, session from bigframes.core import convert from bigframes.core.logging import log_adapter +import bigframes.core.sql.literals from bigframes.ml import core as ml_core from bigframes.operations import ai_ops, output_schemas @@ -394,9 +395,11 @@ def generate_embedding( data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], *, output_dimensionality: Optional[int] = None, + task_type: Optional[str] = None, start_second: Optional[float] = None, end_second: Optional[float] = None, interval_seconds: Optional[float] = None, + trial_id: Optional[int] = None, ) -> dataframe.DataFrame: """ Creates embeddings that describe an entity—for example, a piece of text or an image. @@ -414,32 +417,49 @@ def generate_embedding( Args: model_name (str): The name of a remote model over a Vertex AI multimodalembedding@001 model. - data (DataFrame or Series): - The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column. - If a DataFrame is provided, it must contain a 'content' column, or you must rename the column you wish to embed to 'content'. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to generate embeddings for. If a Series is provided, it is + treated as the 'content' column. If a DataFrame is provided, it + must contain a 'content' column, or you must rename the column you + wish to embed to 'content'. output_dimensionality (int, optional): - The number of dimensions to use when generating embeddings. Valid values are 128, 256, 512, and 1408. The default value is 1408. + An INT64 value that specifies the number of dimensions to use when + generating embeddings. For example, if you specify 256 AS + output_dimensionality, then the embedding output column contains a + 256-dimensional embedding for each input value. To find the + supported range of output dimensions, read about the available + `Google text embedding models `_. + task_type (str, optional): + A STRING literal that specifies the intended downstream application to + help the model produce better quality embeddings. For a list of + supported task types and how to choose which one to use, see `Choose an + embeddings task type `_. start_second (float, optional): The second in the video at which to start the embedding. The default value is 0. end_second (float, optional): The second in the video at which to end the embedding. The default value is 120. interval_seconds (float, optional): The interval to use when creating embeddings. The default value is 16. + trial_id (int, optional): + An INT64 value that identifies the hyperparameter tuning trial that + you want the function to evaluate. The function uses the optimal + trial by default. Only specify this argument if you ran + hyperparameter tuning when creating the model. Returns: - bigframes.dataframe.DataFrame: - A new DataFrame with the generated embeddings. It contains the input table columns and the following columns: - * "embedding": an ARRAY value that contains the generated embedding vector. - * "status": a STRING value that contains the API response status for the corresponding row. - * "video_start_sec": for video content, an INT64 value that contains the starting second. - * "video_end_sec": for video content, an INT64 value that contains the ending second. + bigframes.pandas.DataFrame: + A new DataFrame with the generated embeddings. See the `SQL + reference for AI.GENERATE_EMBEDDING + `_ + for details. """ if isinstance(data, (pd.DataFrame, pd.Series)): data = bpd.read_pandas(data) if isinstance(data, series.Series): - # Rename series to 'content' and convert to DataFrame - data_df = data.rename("content").to_frame() + data = data.copy() + data.name = "content" + data_df = data.to_frame() elif isinstance(data, dataframe.DataFrame): data_df = data else: @@ -448,17 +468,19 @@ def generate_embedding( # We need to get the SQL for the input data to pass as a subquery to the TVF source_sql = data_df.sql - struct_fields = [] + struct_fields = {} if output_dimensionality is not None: - struct_fields.append(f"{output_dimensionality} AS output_dimensionality") + struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality + if task_type is not None: + struct_fields["TASK_TYPE"] = task_type if start_second is not None: - struct_fields.append(f"{start_second} AS start_second") + struct_fields["START_SECOND"] = start_second if end_second is not None: - struct_fields.append(f"{end_second} AS end_second") + struct_fields["END_SECOND"] = end_second if interval_seconds is not None: - struct_fields.append(f"{interval_seconds} AS interval_seconds") - - struct_args = ", ".join(struct_fields) + struct_fields["INTERVAL_SECONDS"] = interval_seconds + if trial_id is not None: + struct_fields["TRIAL_ID"] = trial_id # Construct the TVF query query = f""" @@ -466,7 +488,7 @@ def generate_embedding( FROM AI.GENERATE_EMBEDDING( MODEL `{model_name}`, ({source_sql}), - STRUCT({struct_args}) + {bigframes.core.sql.literals.struct_literal(struct_fields)}) ) """ diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 8f49556ff4c..7d08dd4da74 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -28,6 +28,7 @@ from bigframes.core import utils import bigframes.core.local_data +import bigframes.core.sql.literals from bigframes.core.tools import bigquery_schema import bigframes.session @@ -120,7 +121,7 @@ def _validate_type(name: str, value: Any): supported_types = ( typing.get_args(_BQ_TABLE_TYPES) - + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + + typing.get_args(bigframes.core.sql.literals.SIMPLE_LITERAL_TYPES) + (bigframes.dataframe.DataFrame,) + (pandas.DataFrame,) ) diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index ccd2a16ddcd..521c13c6bdf 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -17,15 +17,11 @@ Utility functions for SQL construction. """ -import datetime -import decimal import json -import math from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union -import shapely.geometry.base # type: ignore - import bigframes.core.compile.googlesql as googlesql +from bigframes.core.sql.literals import simple_literal if TYPE_CHECKING: import google.cloud.bigquery as bigquery @@ -33,75 +29,7 @@ import bigframes.core.ordering -# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. -try: - from shapely.io import to_wkt # type: ignore -except ImportError: - from shapely.wkt import dumps # type: ignore - - to_wkt = dumps - - -SIMPLE_LITERAL_TYPES = Union[ - bytes, - str, - int, - bool, - float, - datetime.datetime, - datetime.date, - datetime.time, - decimal.Decimal, - list, -] - - ### Writing SQL Values (literals, column references, table references, etc.) -def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: - """Return quoted input string.""" - - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals - if value is None: - return "NULL" - elif isinstance(value, str): - # Single quoting seems to work nicer with ibis than double quoting - return f"'{googlesql._escape_chars(value)}'" - elif isinstance(value, bytes): - return repr(value) - elif isinstance(value, (bool, int)): - return str(value) - elif isinstance(value, float): - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals - if math.isnan(value): - return 'CAST("nan" as FLOAT)' - if value == math.inf: - return 'CAST("+inf" as FLOAT)' - if value == -math.inf: - return 'CAST("-inf" as FLOAT)' - return str(value) - # Check datetime first as it is a subclass of date - elif isinstance(value, datetime.datetime): - if value.tzinfo is None: - return f"DATETIME('{value.isoformat()}')" - else: - return f"TIMESTAMP('{value.isoformat()}')" - elif isinstance(value, datetime.date): - return f"DATE('{value.isoformat()}')" - elif isinstance(value, datetime.time): - return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))" - elif isinstance(value, shapely.geometry.base.BaseGeometry): - return f"ST_GEOGFROMTEXT({simple_literal(to_wkt(value))})" - elif isinstance(value, decimal.Decimal): - # TODO: disambiguate BIGNUMERIC based on scale and/or precision - return f"CAST('{str(value)}' AS NUMERIC)" - elif isinstance(value, list): - simple_literals = [simple_literal(i) for i in value] - return f"[{', '.join(simple_literals)}]" - - else: - raise ValueError(f"Cannot produce literal for {value}") - - def multi_literal(*values: str): literal_strings = [simple_literal(i) for i in values] return "(" + ", ".join(literal_strings) + ")" diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py new file mode 100644 index 00000000000..b9db3590c16 --- /dev/null +++ b/bigframes/core/sql/literals.py @@ -0,0 +1,99 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import datetime +import decimal +import math +from typing import Mapping, Union + +import shapely.geometry.base # type: ignore + +import bigframes.core.compile.googlesql as googlesql + +# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. +try: + from shapely.io import to_wkt # type: ignore +except ImportError: + from shapely.wkt import dumps # type: ignore + + to_wkt = dumps + + +SIMPLE_LITERAL_TYPES = Union[ + bytes, + str, + int, + bool, + float, + datetime.datetime, + datetime.date, + datetime.time, + decimal.Decimal, + list, +] + + +def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: + """Return quoted input string.""" + + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals + if value is None: + return "NULL" + elif isinstance(value, str): + # Single quoting seems to work nicer with ibis than double quoting + return f"'{googlesql._escape_chars(value)}'" + elif isinstance(value, bytes): + return repr(value) + elif isinstance(value, (bool, int)): + return str(value) + elif isinstance(value, float): + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals + if math.isnan(value): + return 'CAST("nan" as FLOAT)' + if value == math.inf: + return 'CAST("+inf" as FLOAT)' + if value == -math.inf: + return 'CAST("-inf" as FLOAT)' + return str(value) + # Check datetime first as it is a subclass of date + elif isinstance(value, datetime.datetime): + if value.tzinfo is None: + return f"DATETIME('{value.isoformat()}')" + else: + return f"TIMESTAMP('{value.isoformat()}')" + elif isinstance(value, datetime.date): + return f"DATE('{value.isoformat()}')" + elif isinstance(value, datetime.time): + return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))" + elif isinstance(value, shapely.geometry.base.BaseGeometry): + return f"ST_GEOGFROMTEXT({simple_literal(to_wkt(value))})" + elif isinstance(value, decimal.Decimal): + # TODO: disambiguate BIGNUMERIC based on scale and/or precision + return f"CAST('{str(value)}' AS NUMERIC)" + elif isinstance(value, list): + simple_literals = [simple_literal(i) for i in value] + return f"[{', '.join(simple_literals)}]" + + else: + raise ValueError(f"Cannot produce literal for {value}") + + +def struct_literal(struct_options: Mapping[str, SIMPLE_LITERAL_TYPES]) -> str: + rendered_options = [] + for option_name, option_value in struct_options.items(): + rendered_val = simple_literal(option_value) + rendered_options.append(f"{rendered_val} AS {option_name}") + return f"STRUCT({', '.join(rendered_options)})" diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index ec55fe04269..31102ddd3c4 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -18,6 +18,7 @@ import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql +import bigframes.core.sql.literals def create_model_ddl( @@ -105,11 +106,7 @@ def _build_struct_sql( if not struct_options: return "" - rendered_options = [] - for option_name, option_value in struct_options.items(): - rendered_val = bigframes.core.sql.simple_literal(option_value) - rendered_options.append(f"{rendered_val} AS {option_name}") - return f", STRUCT({', '.join(rendered_options)})" + return f", {bigframes.core.sql.literals.struct_literal}" def evaluate( diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index c9c046664f7..e3bc7d69d32 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -78,11 +78,7 @@ def test_generate_embedding_with_series(mock_series, mock_session): model_name = "project.dataset.model" ai_ops.generate_embedding( - model_name, - mock_series, - start_second=0.0, - end_second=10.0, - interval_seconds=5.0 + model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0 ) mock_series.rename.assert_called_with("content") @@ -94,7 +90,10 @@ def test_generate_embedding_with_series(mock_series, mock_session): assert f"MODEL `{model_name}`" in query assert "(SELECT my_col AS content FROM my_table)" in query - assert "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)" in query + assert ( + "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)" + in query + ) def test_generate_embedding_defaults(mock_dataframe, mock_session): @@ -114,7 +113,9 @@ def test_generate_embedding_defaults(mock_dataframe, mock_session): @mock.patch("bigframes.pandas.read_pandas") -def test_generate_embedding_with_pandas_dataframe(read_pandas_mock, mock_dataframe, mock_session): +def test_generate_embedding_with_pandas_dataframe( + read_pandas_mock, mock_dataframe, mock_session +): # This tests that pandas input path works and calls read_pandas model_name = "project.dataset.model" diff --git a/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql index 01eb4d37819..91d2e03696e 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_evaluate_model_with_options/evaluate_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.EVALUATE(MODEL `my_model`, STRUCT(False AS perform_aggregation, 10 AS horizon, 0.95 AS confidence_level)) +SELECT * FROM ML.EVALUATE(MODEL `my_model`, ) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql index 1214bba8706..c8e1fa555fe 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_model`, (SELECT * FROM new_data), STRUCT(5 AS top_k_features)) +SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_model`, (SELECT * FROM new_data), ) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql index 1a3baa0c13b..81c399f63ff 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_global_explain_model_with_options/global_explain_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL `my_model`, STRUCT(True AS class_level_explain)) +SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL `my_model`, ) diff --git a/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql index 96c8074e4c1..267815415b7 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_predict_model_with_options/predict_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.PREDICT(MODEL `my_model`, (SELECT * FROM new_data), STRUCT(True AS keep_original_columns)) +SELECT * FROM ML.PREDICT(MODEL `my_model`, (SELECT * FROM new_data), ) From 26201e4603b8a76a1f2c1cc654396783047e62e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 16:53:06 +0000 Subject: [PATCH 3/9] revert move to literals submodule --- bigframes/core/pyformat.py | 3 +- bigframes/core/sql/__init__.py | 74 +++++++++++++- bigframes/core/sql/literals.py | 99 ------------------- bigframes/core/sql/ml.py | 1 - .../explain_predict_model_with_options.sql | 2 +- 5 files changed, 75 insertions(+), 104 deletions(-) delete mode 100644 bigframes/core/sql/literals.py diff --git a/bigframes/core/pyformat.py b/bigframes/core/pyformat.py index 7d08dd4da74..8f49556ff4c 100644 --- a/bigframes/core/pyformat.py +++ b/bigframes/core/pyformat.py @@ -28,7 +28,6 @@ from bigframes.core import utils import bigframes.core.local_data -import bigframes.core.sql.literals from bigframes.core.tools import bigquery_schema import bigframes.session @@ -121,7 +120,7 @@ def _validate_type(name: str, value: Any): supported_types = ( typing.get_args(_BQ_TABLE_TYPES) - + typing.get_args(bigframes.core.sql.literals.SIMPLE_LITERAL_TYPES) + + typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES) + (bigframes.dataframe.DataFrame,) + (pandas.DataFrame,) ) diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index 521c13c6bdf..ccd2a16ddcd 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -17,11 +17,15 @@ Utility functions for SQL construction. """ +import datetime +import decimal import json +import math from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union +import shapely.geometry.base # type: ignore + import bigframes.core.compile.googlesql as googlesql -from bigframes.core.sql.literals import simple_literal if TYPE_CHECKING: import google.cloud.bigquery as bigquery @@ -29,7 +33,75 @@ import bigframes.core.ordering +# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. +try: + from shapely.io import to_wkt # type: ignore +except ImportError: + from shapely.wkt import dumps # type: ignore + + to_wkt = dumps + + +SIMPLE_LITERAL_TYPES = Union[ + bytes, + str, + int, + bool, + float, + datetime.datetime, + datetime.date, + datetime.time, + decimal.Decimal, + list, +] + + ### Writing SQL Values (literals, column references, table references, etc.) +def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: + """Return quoted input string.""" + + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals + if value is None: + return "NULL" + elif isinstance(value, str): + # Single quoting seems to work nicer with ibis than double quoting + return f"'{googlesql._escape_chars(value)}'" + elif isinstance(value, bytes): + return repr(value) + elif isinstance(value, (bool, int)): + return str(value) + elif isinstance(value, float): + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals + if math.isnan(value): + return 'CAST("nan" as FLOAT)' + if value == math.inf: + return 'CAST("+inf" as FLOAT)' + if value == -math.inf: + return 'CAST("-inf" as FLOAT)' + return str(value) + # Check datetime first as it is a subclass of date + elif isinstance(value, datetime.datetime): + if value.tzinfo is None: + return f"DATETIME('{value.isoformat()}')" + else: + return f"TIMESTAMP('{value.isoformat()}')" + elif isinstance(value, datetime.date): + return f"DATE('{value.isoformat()}')" + elif isinstance(value, datetime.time): + return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))" + elif isinstance(value, shapely.geometry.base.BaseGeometry): + return f"ST_GEOGFROMTEXT({simple_literal(to_wkt(value))})" + elif isinstance(value, decimal.Decimal): + # TODO: disambiguate BIGNUMERIC based on scale and/or precision + return f"CAST('{str(value)}' AS NUMERIC)" + elif isinstance(value, list): + simple_literals = [simple_literal(i) for i in value] + return f"[{', '.join(simple_literals)}]" + + else: + raise ValueError(f"Cannot produce literal for {value}") + + def multi_literal(*values: str): literal_strings = [simple_literal(i) for i in values] return "(" + ", ".join(literal_strings) + ")" diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py deleted file mode 100644 index b9db3590c16..00000000000 --- a/bigframes/core/sql/literals.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import datetime -import decimal -import math -from typing import Mapping, Union - -import shapely.geometry.base # type: ignore - -import bigframes.core.compile.googlesql as googlesql - -# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0. -try: - from shapely.io import to_wkt # type: ignore -except ImportError: - from shapely.wkt import dumps # type: ignore - - to_wkt = dumps - - -SIMPLE_LITERAL_TYPES = Union[ - bytes, - str, - int, - bool, - float, - datetime.datetime, - datetime.date, - datetime.time, - decimal.Decimal, - list, -] - - -def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: - """Return quoted input string.""" - - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals - if value is None: - return "NULL" - elif isinstance(value, str): - # Single quoting seems to work nicer with ibis than double quoting - return f"'{googlesql._escape_chars(value)}'" - elif isinstance(value, bytes): - return repr(value) - elif isinstance(value, (bool, int)): - return str(value) - elif isinstance(value, float): - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals - if math.isnan(value): - return 'CAST("nan" as FLOAT)' - if value == math.inf: - return 'CAST("+inf" as FLOAT)' - if value == -math.inf: - return 'CAST("-inf" as FLOAT)' - return str(value) - # Check datetime first as it is a subclass of date - elif isinstance(value, datetime.datetime): - if value.tzinfo is None: - return f"DATETIME('{value.isoformat()}')" - else: - return f"TIMESTAMP('{value.isoformat()}')" - elif isinstance(value, datetime.date): - return f"DATE('{value.isoformat()}')" - elif isinstance(value, datetime.time): - return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))" - elif isinstance(value, shapely.geometry.base.BaseGeometry): - return f"ST_GEOGFROMTEXT({simple_literal(to_wkt(value))})" - elif isinstance(value, decimal.Decimal): - # TODO: disambiguate BIGNUMERIC based on scale and/or precision - return f"CAST('{str(value)}' AS NUMERIC)" - elif isinstance(value, list): - simple_literals = [simple_literal(i) for i in value] - return f"[{', '.join(simple_literals)}]" - - else: - raise ValueError(f"Cannot produce literal for {value}") - - -def struct_literal(struct_options: Mapping[str, SIMPLE_LITERAL_TYPES]) -> str: - rendered_options = [] - for option_name, option_value in struct_options.items(): - rendered_val = simple_literal(option_value) - rendered_options.append(f"{rendered_val} AS {option_name}") - return f"STRUCT({', '.join(rendered_options)})" diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index 1fc89e8becf..d77c5aa4a0b 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -20,7 +20,6 @@ import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql -import bigframes.core.sql.literals def create_model_ddl( diff --git a/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql b/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql index c8e1fa555fe..1214bba8706 100644 --- a/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql +++ b/tests/unit/core/sql/snapshots/test_ml/test_explain_predict_model_with_options/explain_predict_model_with_options.sql @@ -1 +1 @@ -SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_model`, (SELECT * FROM new_data), ) +SELECT * FROM ML.EXPLAIN_PREDICT(MODEL `my_model`, (SELECT * FROM new_data), STRUCT(5 AS top_k_features)) From 7056f4c124018c803101987a73d799aac14f03b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 16:55:42 +0000 Subject: [PATCH 4/9] fix missing import --- bigframes/bigquery/_operations/ai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 720d6dc5b7b..7917c5c1730 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -28,7 +28,7 @@ from bigframes import series, session from bigframes.core import convert from bigframes.core.logging import log_adapter -import bigframes.core.sql.literals +import bigframes.core.sql from bigframes.ml import core as ml_core from bigframes.operations import ai_ops, output_schemas @@ -488,7 +488,7 @@ def generate_embedding( FROM AI.GENERATE_EMBEDDING( MODEL `{model_name}`, ({source_sql}), - {bigframes.core.sql.literals.struct_literal(struct_fields)}) + {bigframes.core.sql.struct_literal(struct_fields)}) ) """ From fae425deff9b58b7c694c498b64cde79b02fcfc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 17:06:31 +0000 Subject: [PATCH 5/9] try again at literals import --- bigframes/bigquery/_operations/ai.py | 4 +- bigframes/core/sql/literals.py | 59 ++++++++++++++++++++++++++++ bigframes/core/sql/ml.py | 32 +-------------- 3 files changed, 63 insertions(+), 32 deletions(-) create mode 100644 bigframes/core/sql/literals.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 7917c5c1730..720d6dc5b7b 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -28,7 +28,7 @@ from bigframes import series, session from bigframes.core import convert from bigframes.core.logging import log_adapter -import bigframes.core.sql +import bigframes.core.sql.literals from bigframes.ml import core as ml_core from bigframes.operations import ai_ops, output_schemas @@ -488,7 +488,7 @@ def generate_embedding( FROM AI.GENERATE_EMBEDDING( MODEL `{model_name}`, ({source_sql}), - {bigframes.core.sql.struct_literal(struct_fields)}) + {bigframes.core.sql.literals.struct_literal(struct_fields)}) ) """ diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py new file mode 100644 index 00000000000..693c9d629c2 --- /dev/null +++ b/bigframes/core/sql/literals.py @@ -0,0 +1,59 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import collections.abc +import json +from typing import Any, Dict, List, Mapping, Optional, Union + +import bigframes.core.compile.googlesql as googlesql +import bigframes.core.sql + + +def struct_literal( + struct_options: Mapping[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] +) -> str: + rendered_options = [] + for option_name, option_value in struct_options.items(): + if option_name == "model_params": + json_str = json.dumps(option_value) + # Escape single quotes for SQL string literal + sql_json_str = json_str.replace("'", "''") + rendered_val = f"JSON'{sql_json_str}'" + elif isinstance(option_value, collections.abc.Mapping): + struct_body = ", ".join( + [ + f"{bigframes.core.sql.simple_literal(v)} AS {k}" + for k, v in option_value.items() + ] + ) + rendered_val = f"STRUCT({struct_body})" + elif isinstance(option_value, list): + rendered_val = ( + "[" + + ", ".join( + [bigframes.core.sql.simple_literal(v) for v in option_value] + ) + + "]" + ) + elif isinstance(option_value, bool): + rendered_val = str(option_value).lower() + else: + rendered_val = bigframes.core.sql.simple_literal(option_value) + rendered_options.append(f"{rendered_val} AS {option_name}") + return f"STRUCT({', '.join(rendered_options)})" diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index d77c5aa4a0b..5b05572b174 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -20,6 +20,7 @@ import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql +import bigframes.core.sql.literals def create_model_ddl( @@ -109,36 +110,7 @@ def _build_struct_sql( ) -> str: if not struct_options: return "" - - rendered_options = [] - for option_name, option_value in struct_options.items(): - if option_name == "model_params": - json_str = json.dumps(option_value) - # Escape single quotes for SQL string literal - sql_json_str = json_str.replace("'", "''") - rendered_val = f"JSON'{sql_json_str}'" - elif isinstance(option_value, collections.abc.Mapping): - struct_body = ", ".join( - [ - f"{bigframes.core.sql.simple_literal(v)} AS {k}" - for k, v in option_value.items() - ] - ) - rendered_val = f"STRUCT({struct_body})" - elif isinstance(option_value, list): - rendered_val = ( - "[" - + ", ".join( - [bigframes.core.sql.simple_literal(v) for v in option_value] - ) - + "]" - ) - elif isinstance(option_value, bool): - rendered_val = str(option_value).lower() - else: - rendered_val = bigframes.core.sql.simple_literal(option_value) - rendered_options.append(f"{rendered_val} AS {option_name}") - return f", STRUCT({', '.join(rendered_options)})" + return f", {bigframes.core.sql.literals.struct_literal(struct_options)}" def evaluate( From 2624a78d8fefd485980b3e184c122fec78753ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 17:28:56 +0000 Subject: [PATCH 6/9] fix tests --- tests/unit/bigquery/test_ai.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index e3bc7d69d32..a1507997b60 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -38,14 +38,15 @@ def mock_dataframe(mock_session): @pytest.fixture def mock_series(mock_session): - s = mock.create_autospec(spec=bigframes.series.Series) - s._session = mock_session + series = mock.create_autospec(spec=bigframes.series.Series) + series._session = mock_session # Mock to_frame to return a mock dataframe df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) df._session = mock_session df.sql = "SELECT my_col AS content FROM my_table" - s.rename.return_value.to_frame.return_value = df - return s + series.copy.return_value = series + series.to_frame.return_value = df + return series def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): @@ -66,7 +67,7 @@ def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING(" expected_part_2 = f"MODEL `{model_name}`," expected_part_3 = "(SELECT * FROM my_table)," - expected_part_4 = "STRUCT(256 AS output_dimensionality)" + expected_part_4 = "STRUCT(256 AS OUTPUT_DIMENSIONALITY)" assert expected_part_1 in query assert expected_part_2 in query @@ -81,9 +82,6 @@ def test_generate_embedding_with_series(mock_series, mock_session): model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0 ) - mock_series.rename.assert_called_with("content") - mock_series.rename.return_value.to_frame.assert_called_once() - mock_session.read_gbq.assert_called_once() query = mock_session.read_gbq.call_args[0][0] query = " ".join(query.split()) @@ -91,7 +89,7 @@ def test_generate_embedding_with_series(mock_series, mock_session): assert f"MODEL `{model_name}`" in query assert "(SELECT my_col AS content FROM my_table)" in query assert ( - "STRUCT(0.0 AS start_second, 10.0 AS end_second, 5.0 AS interval_seconds)" + "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)" in query ) From 93e92d92cd5ca8f186a534d3af1fcedff7ac8ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 17:30:42 +0000 Subject: [PATCH 7/9] fix docs --- bigframes/bigquery/_operations/ai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 720d6dc5b7b..96f4d667431 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -416,7 +416,8 @@ def generate_embedding( Args: model_name (str): - The name of a remote model over a Vertex AI multimodalembedding@001 model. + The name of a remote model from Vertex AI, such as the + multimodalembedding@001 model. data (bigframes.pandas.DataFrame or bigframes.pandas.Series): The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column. If a DataFrame is provided, it From f09cac6163e91bd4987c2b1cc711d3de45bbfa7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 18:45:01 +0000 Subject: [PATCH 8/9] fix lint and add imports --- bigframes/bigquery/_operations/ai.py | 2 +- bigframes/bigquery/ai.py | 2 ++ bigframes/core/sql/literals.py | 15 +++++++-------- bigframes/core/sql/ml.py | 2 -- tests/unit/bigquery/test_ai.py | 10 +++++----- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 96f4d667431..20ec60c5b8d 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -469,7 +469,7 @@ def generate_embedding( # We need to get the SQL for the input data to pass as a subquery to the TVF source_sql = data_df.sql - struct_fields = {} + struct_fields: bigframes.core.sql.literals.STRUCT_TYPE = {} if output_dimensionality is not None: struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality if task_type is not None: diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index 3af52205a65..b0d9b62f9be 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -22,6 +22,7 @@ generate, generate_bool, generate_double, + generate_embedding, generate_int, if_, score, @@ -33,6 +34,7 @@ "generate", "generate_bool", "generate_double", + "generate_embedding", "generate_int", "if_", "score", diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py index 693c9d629c2..0c8c78a3d92 100644 --- a/bigframes/core/sql/literals.py +++ b/bigframes/core/sql/literals.py @@ -16,18 +16,17 @@ import collections.abc import json -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, List, Mapping, Union -import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql +STRUCT_TYPE = Mapping[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], +] -def struct_literal( - struct_options: Mapping[ - str, - Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], - ] -) -> str: + +def struct_literal(struct_options: STRUCT_TYPE) -> str: rendered_options = [] for option_name, option_value in struct_options.items(): if option_name == "model_params": diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index 5b05572b174..a2a4d32ae84 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -14,8 +14,6 @@ from __future__ import annotations -import collections.abc -import json from typing import Any, Dict, List, Mapping, Optional, Union import bigframes.core.compile.googlesql as googlesql diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index a1507997b60..0f9df6cc268 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -17,7 +17,7 @@ import pandas as pd import pytest -import bigframes.bigquery._operations.ai as ai_ops +import bigframes.bigquery as bbq import bigframes.dataframe import bigframes.series import bigframes.session @@ -52,7 +52,7 @@ def mock_series(mock_session): def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): model_name = "project.dataset.model" - ai_ops.generate_embedding( + bbq.ai.generate_embedding( model_name, mock_dataframe, output_dimensionality=256, @@ -78,7 +78,7 @@ def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): def test_generate_embedding_with_series(mock_series, mock_session): model_name = "project.dataset.model" - ai_ops.generate_embedding( + bbq.ai.generate_embedding( model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0 ) @@ -97,7 +97,7 @@ def test_generate_embedding_with_series(mock_series, mock_session): def test_generate_embedding_defaults(mock_dataframe, mock_session): model_name = "project.dataset.model" - ai_ops.generate_embedding( + bbq.ai.generate_embedding( model_name, mock_dataframe, ) @@ -122,7 +122,7 @@ def test_generate_embedding_with_pandas_dataframe( pandas_df = pd.DataFrame({"content": ["test"]}) - ai_ops.generate_embedding( + bbq.ai.generate_embedding( model_name, pandas_df, ) From e0cd6cb64f2baedb6c794d4207bf5fb7b5e3463d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Feb 2026 18:48:04 +0000 Subject: [PATCH 9/9] types --- bigframes/bigquery/_operations/ai.py | 4 ++-- bigframes/core/sql/literals.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 20ec60c5b8d..17af5dd5cfb 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -19,7 +19,7 @@ from __future__ import annotations import json -from typing import Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union import pandas as pd @@ -469,7 +469,7 @@ def generate_embedding( # We need to get the SQL for the input data to pass as a subquery to the TVF source_sql = data_df.sql - struct_fields: bigframes.core.sql.literals.STRUCT_TYPE = {} + struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {} if output_dimensionality is not None: struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality if task_type is not None: diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py index 0c8c78a3d92..59c81977315 100644 --- a/bigframes/core/sql/literals.py +++ b/bigframes/core/sql/literals.py @@ -20,10 +20,10 @@ import bigframes.core.sql -STRUCT_TYPE = Mapping[ - str, - Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], +STRUCT_VALUES = Union[ + str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any] ] +STRUCT_TYPE = Mapping[str, STRUCT_VALUES] def struct_literal(struct_options: STRUCT_TYPE) -> str: