diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
index fd7dafe95f..17af5dd5cf 100644
--- a/bigframes/bigquery/_operations/ai.py
+++ b/bigframes/bigquery/_operations/ai.py
@@ -19,7 +19,7 @@ from __future__ import annotations
 
 import json
-from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
+from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union
 
 import pandas as pd
 
@@ -28,6 +28,7 @@ from bigframes import series, session
 from bigframes.core import convert
 from bigframes.core.logging import log_adapter
+import bigframes.core.sql.literals
 from bigframes.ml import core as ml_core
 from bigframes.operations import ai_ops, output_schemas
 
@@ -388,6 +389,113 @@ def generate_double(
     return series_list[0]._apply_nary_op(operator, series_list[1:])
 
 
+@log_adapter.method_logger(custom_base_name="bigquery_ai")
+def generate_embedding(
+    model_name: str,
+    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
+    *,
+    output_dimensionality: Optional[int] = None,
+    task_type: Optional[str] = None,
+    start_second: Optional[float] = None,
+    end_second: Optional[float] = None,
+    interval_seconds: Optional[float] = None,
+    trial_id: Optional[int] = None,
+) -> dataframe.DataFrame:
+    """
+    Creates embeddings that describe an entity—for example, a piece of text or an image.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
+        >>> bbq.ai.generate_embedding(
+        ...     "project.dataset.model_name",
+        ...     df
+        ... ) # doctest: +SKIP
+
+    Args:
+        model_name (str):
+            The name of a remote model from Vertex AI, such as the
+            multimodalembedding@001 model.
+        data (bigframes.pandas.DataFrame or bigframes.pandas.Series):
+            The data to generate embeddings for. If a Series is provided, it is
+            treated as the 'content' column. If a DataFrame is provided, it
+            must contain a 'content' column, or you must rename the column you
+            wish to embed to 'content'.
+        output_dimensionality (int, optional):
+            An INT64 value that specifies the number of dimensions to use when
+            generating embeddings. For example, if you specify 256 AS
+            output_dimensionality, then the embedding output column contains a
+            256-dimensional embedding for each input value. To find the
+            supported range of output dimensions, read about the available
+            `Google text embedding models `_.
+        task_type (str, optional):
+            A STRING literal that specifies the intended downstream application
+            to help the model produce better quality embeddings. For a list of
+            supported task types and how to choose which one to use, see
+            `Choose an embeddings task type `_.
+        start_second (float, optional):
+            The second in the video at which to start the embedding. The
+            default value is 0.
+        end_second (float, optional):
+            The second in the video at which to end the embedding. The default
+            value is 120.
+        interval_seconds (float, optional):
+            The interval to use when creating embeddings. The default value
+            is 16.
+        trial_id (int, optional):
+            An INT64 value that identifies the hyperparameter tuning trial that
+            you want the function to evaluate. The function uses the optimal
+            trial by default. Only specify this argument if you ran
+            hyperparameter tuning when creating the model.
+
+    Returns:
+        bigframes.pandas.DataFrame:
+            A new DataFrame with the generated embeddings. See the `SQL
+            reference for AI.GENERATE_EMBEDDING
+            `_
+            for details.
+ """ + if isinstance(data, (pd.DataFrame, pd.Series)): + data = bpd.read_pandas(data) + + if isinstance(data, series.Series): + data = data.copy() + data.name = "content" + data_df = data.to_frame() + elif isinstance(data, dataframe.DataFrame): + data_df = data + else: + raise ValueError(f"Unsupported data type: {type(data)}") + + # We need to get the SQL for the input data to pass as a subquery to the TVF + source_sql = data_df.sql + + struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {} + if output_dimensionality is not None: + struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality + if task_type is not None: + struct_fields["TASK_TYPE"] = task_type + if start_second is not None: + struct_fields["START_SECOND"] = start_second + if end_second is not None: + struct_fields["END_SECOND"] = end_second + if interval_seconds is not None: + struct_fields["INTERVAL_SECONDS"] = interval_seconds + if trial_id is not None: + struct_fields["TRIAL_ID"] = trial_id + + # Construct the TVF query + query = f""" + SELECT * + FROM AI.GENERATE_EMBEDDING( + MODEL `{model_name}`, + ({source_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)}) + ) + """ + + return data_df._session.read_gbq(query) + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index 3af52205a6..b0d9b62f9b 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -22,6 +22,7 @@ generate, generate_bool, generate_double, + generate_embedding, generate_int, if_, score, @@ -33,6 +34,7 @@ "generate", "generate_bool", "generate_double", + "generate_embedding", "generate_int", "if_", "score", diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py new file mode 100644 index 0000000000..59c8197731 --- /dev/null +++ b/bigframes/core/sql/literals.py @@ -0,0 +1,58 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import annotations
+
+import collections.abc
+import json
+from typing import Any, List, Mapping, Union
+
+import bigframes.core.sql
+
+STRUCT_VALUES = Union[
+    str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]
+]
+STRUCT_TYPE = Mapping[str, STRUCT_VALUES]
+
+
+def struct_literal(struct_options: STRUCT_TYPE) -> str:
+    rendered_options = []
+    for option_name, option_value in struct_options.items():
+        if option_name == "model_params":
+            json_str = json.dumps(option_value)
+            # Escape single quotes for SQL string literal
+            sql_json_str = json_str.replace("'", "''")
+            rendered_val = f"JSON'{sql_json_str}'"
+        elif isinstance(option_value, collections.abc.Mapping):
+            struct_body = ", ".join(
+                [
+                    f"{bigframes.core.sql.simple_literal(v)} AS {k}"
+                    for k, v in option_value.items()
+                ]
+            )
+            rendered_val = f"STRUCT({struct_body})"
+        elif isinstance(option_value, list):
+            rendered_val = (
+                "["
+                + ", ".join(
+                    [bigframes.core.sql.simple_literal(v) for v in option_value]
+                )
+                + "]"
+            )
+        elif isinstance(option_value, bool):
+            rendered_val = str(option_value).lower()
+        else:
+            rendered_val = bigframes.core.sql.simple_literal(option_value)
+        rendered_options.append(f"{rendered_val} AS {option_name}")
+    return f"STRUCT({', '.join(rendered_options)})"
diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py
index d77c5aa4a0..a2a4d32ae8 100644
--- a/bigframes/core/sql/ml.py
+++ b/bigframes/core/sql/ml.py
@@ -14,12 +14,11 @@
 
 from __future__ import annotations
 
-import collections.abc
-import json
 from typing import Any, Dict, List, Mapping, Optional, Union
 
 import bigframes.core.compile.googlesql as googlesql
 import bigframes.core.sql
+import bigframes.core.sql.literals
 
 
 def create_model_ddl(
@@ -109,36 +108,7 @@ def _build_struct_sql(
 ) -> str:
     if not struct_options:
         return ""
-
-    rendered_options = []
-    for option_name, option_value in struct_options.items():
-        if option_name == "model_params":
-            json_str = json.dumps(option_value)
-            # Escape single quotes for SQL string literal
-            sql_json_str = json_str.replace("'", "''")
-            rendered_val = f"JSON'{sql_json_str}'"
-        elif isinstance(option_value, collections.abc.Mapping):
-            struct_body = ", ".join(
-                [
-                    f"{bigframes.core.sql.simple_literal(v)} AS {k}"
-                    for k, v in option_value.items()
-                ]
-            )
-            rendered_val = f"STRUCT({struct_body})"
-        elif isinstance(option_value, list):
-            rendered_val = (
-                "["
-                + ", ".join(
-                    [bigframes.core.sql.simple_literal(v) for v in option_value]
-                )
-                + "]"
-            )
-        elif isinstance(option_value, bool):
-            rendered_val = str(option_value).lower()
-        else:
-            rendered_val = bigframes.core.sql.simple_literal(option_value)
-        rendered_options.append(f"{rendered_val} AS {option_name}")
-    return f", STRUCT({', '.join(rendered_options)})"
+    return f", {bigframes.core.sql.literals.struct_literal(struct_options)}"
 
 
 def evaluate(
diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py
new file mode 100644
index 0000000000..0f9df6cc26
--- /dev/null
+++ b/tests/unit/bigquery/test_ai.py
@@ -0,0 +1,134 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from unittest import mock
+
+import pandas as pd
+import pytest
+
+import bigframes.bigquery as bbq
+import bigframes.dataframe
+import bigframes.series
+import bigframes.session
+
+
+@pytest.fixture
+def mock_session():
+    return mock.create_autospec(spec=bigframes.session.Session)
+
+
+@pytest.fixture
+def mock_dataframe(mock_session):
+    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
+    df._session = mock_session
+    df.sql = "SELECT * FROM my_table"
+    return df
+
+
+@pytest.fixture
+def mock_series(mock_session):
+    series = mock.create_autospec(spec=bigframes.series.Series)
+    series._session = mock_session
+    # Mock to_frame to return a mock dataframe
+    df = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
+    df._session = mock_session
+    df.sql = "SELECT my_col AS content FROM my_table"
+    series.copy.return_value = series
+    series.to_frame.return_value = df
+    return series
+
+
+def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name,
+        mock_dataframe,
+        output_dimensionality=256,
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+
+    # Normalize whitespace for comparison
+    query = " ".join(query.split())
+
+    expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING("
+    expected_part_2 = f"MODEL `{model_name}`,"
+    expected_part_3 = "(SELECT * FROM my_table),"
+    expected_part_4 = "STRUCT(256 AS OUTPUT_DIMENSIONALITY)"
+
+    assert expected_part_1 in query
+    assert expected_part_2 in query
+    assert expected_part_3 in query
+    assert expected_part_4 in query
+
+
+def test_generate_embedding_with_series(mock_series, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+    query = " ".join(query.split())
+
+    assert f"MODEL `{model_name}`" in query
+    assert "(SELECT my_col AS content FROM my_table)" in query
+    assert (
+        "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)"
+        in query
+    )
+
+
+def test_generate_embedding_defaults(mock_dataframe, mock_session):
+    model_name = "project.dataset.model"
+
+    bbq.ai.generate_embedding(
+        model_name,
+        mock_dataframe,
+    )
+
+    mock_session.read_gbq.assert_called_once()
+    query = mock_session.read_gbq.call_args[0][0]
+    query = " ".join(query.split())
+
+    assert f"MODEL `{model_name}`" in query
+    assert "STRUCT()" in query
+
+
+@mock.patch("bigframes.pandas.read_pandas")
+def test_generate_embedding_with_pandas_dataframe(
+    read_pandas_mock, mock_dataframe, mock_session
+):
+    # Tests that the pandas input path works and calls read_pandas
+    model_name = "project.dataset.model"
+
+    # Mock return value of read_pandas to be a BigFrames DataFrame
+    read_pandas_mock.return_value = mock_dataframe
+
+    pandas_df = pd.DataFrame({"content": ["test"]})
+
+    bbq.ai.generate_embedding(
+        model_name,
+        pandas_df,
+    )
+
+    read_pandas_mock.assert_called_once()
+    # Check that read_pandas was called with something (the pandas df)
+    assert read_pandas_mock.call_args[0][0] is pandas_df
+
+    mock_session.read_gbq.assert_called_once()
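Usage sketch, mirroring the unit tests above (illustrative only; "df" stands for any
BigFrames DataFrame with a 'content' column, and the model path is a placeholder):

    >>> import bigframes.bigquery as bbq
    >>> embeddings = bbq.ai.generate_embedding(
    ...     "project.dataset.model",
    ...     df,
    ...     output_dimensionality=256,
    ... ) # doctest: +SKIP

With these arguments the function issues a TVF query of the form

    SELECT *
    FROM AI.GENERATE_EMBEDDING(
        MODEL `project.dataset.model`,
        (SELECT * FROM ...),
        STRUCT(256 AS OUTPUT_DIMENSIONALITY)
    )

which is the shape that test_generate_embedding_with_dataframe asserts on.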