From 86b8494b9154b9f3187f3b43528344be0b04df26 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 2 Feb 2026 10:19:40 -0800 Subject: [PATCH 1/8] chore: move bigquery/table.py to bigquery/_operations/table.py (#2423) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/__init__.py | 2 +- bigframes/bigquery/{ => _operations}/table.py | 0 tests/unit/bigquery/test_table.py | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) rename bigframes/bigquery/{ => _operations}/table.py (100%) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 5728f153dd..150fe5efc0 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -60,7 +60,7 @@ from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.sql import sql_scalar from bigframes.bigquery._operations.struct import struct -from bigframes.bigquery.table import create_external_table +from bigframes.bigquery._operations.table import create_external_table from bigframes.core.logging import log_adapter _functions = [ diff --git a/bigframes/bigquery/table.py b/bigframes/bigquery/_operations/table.py similarity index 100% rename from bigframes/bigquery/table.py rename to bigframes/bigquery/_operations/table.py diff --git a/tests/unit/bigquery/test_table.py b/tests/unit/bigquery/test_table.py index 441130d53d..badce5e5e2 100644 --- a/tests/unit/bigquery/test_table.py +++ b/tests/unit/bigquery/test_table.py @@ -16,7 +16,7 @@ import pytest -import bigframes.bigquery.table +import bigframes.bigquery import bigframes.core.sql.table import bigframes.session @@ -80,9 +80,9 @@ def test_create_external_table_ddl_connection(): assert sql == expected -@mock.patch("bigframes.bigquery.table._get_table_metadata") +@mock.patch("bigframes.bigquery._operations.table._get_table_metadata") def test_create_external_table(get_table_metadata_mock, mock_session): - bigframes.bigquery.table.create_external_table( + bigframes.bigquery.create_external_table( "my-project.my_dataset.my_table", columns={"col1": "INT64", "col2": "STRING"}, options={"format": "CSV", "uris": ["gs://bucket/path*"]}, From 0d1359b14c67a730d822d79e274d36695ad7aef8 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 2 Feb 2026 13:46:23 -0800 Subject: [PATCH 2/8] feat: add `bigframes.pandas.options.experiments.sql_compiler` for switching the backend compiler (#2417) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds `bigframes.pandas.options.experiments.sql_compiler` to allow switching the backend compiler. Currently, the default remains set to 'legacy' (ibis), but users can now optionally switch to the 'experimental' (sqlglot) compiler. 
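(In the accompanying code, the option's default literal is "stable", which, like "legacy", resolves to the ibis compiler; "experimental" selects the sqlglot compiler.)

A minimal usage sketch, assuming the option path named in this change (the `read_gbq` call and the `.sql` property are only illustrative):

```python
import bigframes.pandas as bpd

# Opt in to the sqlglot-based compiler; setting "experimental" emits a FutureWarning.
bpd.options.experiments.sql_compiler = "experimental"

df = bpd.read_gbq("SELECT 1 AS x")
print(df.sql)  # SQL generated via the experimental compiler path
```
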
Fixes internal issue 479912001🦕 --- bigframes/_config/experiment_options.py | 21 ++++++++++++++++++- bigframes/core/compile/__init__.py | 19 +++++++++++++++-- bigframes/session/bq_caching_executor.py | 10 ++++++--- bigframes/session/direct_gbq_execution.py | 7 +++++-- tests/unit/_config/test_experiment_options.py | 15 +++++++++++++ 5 files changed, 64 insertions(+), 8 deletions(-) diff --git a/bigframes/_config/experiment_options.py b/bigframes/_config/experiment_options.py index 024de392c0..ee54e017fe 100644 --- a/bigframes/_config/experiment_options.py +++ b/bigframes/_config/experiment_options.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Literal, Optional import warnings import bigframes @@ -27,6 +27,7 @@ class ExperimentOptions: def __init__(self): self._semantic_operators: bool = False self._ai_operators: bool = False + self._sql_compiler: Literal["legacy", "stable", "experimental"] = "stable" @property def semantic_operators(self) -> bool: @@ -55,6 +56,24 @@ def ai_operators(self, value: bool): warnings.warn(msg, category=bfe.PreviewWarning) self._ai_operators = value + @property + def sql_compiler(self) -> Literal["legacy", "stable", "experimental"]: + return self._sql_compiler + + @sql_compiler.setter + def sql_compiler(self, value: Literal["legacy", "stable", "experimental"]): + if value not in ["legacy", "stable", "experimental"]: + raise ValueError( + "sql_compiler must be one of 'legacy', 'stable', or 'experimental'" + ) + if value == "experimental": + msg = bfe.format_message( + "The experimental SQL compiler is still under experiments, and is subject " + "to change in the future." + ) + warnings.warn(msg, category=FutureWarning) + self._sql_compiler = value + @property def blob(self) -> bool: msg = bfe.format_message( diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 68c36df288..15d2d0e52c 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,13 +13,28 @@ # limitations under the License. 
from __future__ import annotations +from typing import Any + +from bigframes import options from bigframes.core.compile.api import test_only_ibis_inferred_schema from bigframes.core.compile.configs import CompileRequest, CompileResult -from bigframes.core.compile.ibis_compiler.ibis_compiler import compile_sql + + +def compiler() -> Any: + """Returns the appropriate compiler module based on session options.""" + if options.experiments.sql_compiler == "experimental": + import bigframes.core.compile.sqlglot.compiler as sqlglot_compiler + + return sqlglot_compiler + else: + import bigframes.core.compile.ibis_compiler.ibis_compiler as ibis_compiler + + return ibis_compiler + __all__ = [ "test_only_ibis_inferred_schema", - "compile_sql", "CompileRequest", "CompileResult", + "compiler", ] diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index ca19d1be86..57af7daf65 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -174,7 +174,9 @@ def to_sql( else array_value.node ) node = self._substitute_large_local_sources(node) - compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered)) + compiled = compile.compiler().compile_sql( + compile.CompileRequest(node, sort_rows=ordered) + ) return compiled.sql def execute( @@ -290,7 +292,9 @@ def _export_gbq( # validate destination table existing_table = self._maybe_find_existing_table(spec) - compiled = compile.compile_sql(compile.CompileRequest(plan, sort_rows=False)) + compiled = compile.compiler().compile_sql( + compile.CompileRequest(plan, sort_rows=False) + ) sql = compiled.sql if (existing_table is not None) and _if_schema_match( @@ -641,7 +645,7 @@ def _execute_plan_gbq( ] cluster_cols = cluster_cols[:_MAX_CLUSTER_COLUMNS] - compiled = compile.compile_sql( + compiled = compile.compiler().compile_sql( compile.CompileRequest( plan, sort_rows=ordered, diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index 3ec10bf20f..c60670b542 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -20,7 +20,8 @@ import google.cloud.bigquery.table as bq_table from bigframes.core import compile, nodes -from bigframes.core.compile import sqlglot +import bigframes.core.compile.ibis_compiler.ibis_compiler as ibis_compiler +import bigframes.core.compile.sqlglot.compiler as sqlglot_compiler import bigframes.core.events from bigframes.session import executor, semi_executor import bigframes.session._io.bigquery as bq_io @@ -40,7 +41,9 @@ def __init__( ): self.bqclient = bqclient self._compile_fn = ( - compile.compile_sql if compiler == "ibis" else sqlglot.compile_sql + ibis_compiler.compile_sql + if compiler == "ibis" + else sqlglot_compiler.compile_sql ) self._publisher = publisher diff --git a/tests/unit/_config/test_experiment_options.py b/tests/unit/_config/test_experiment_options.py index deeee2e46a..0e69dfe36d 100644 --- a/tests/unit/_config/test_experiment_options.py +++ b/tests/unit/_config/test_experiment_options.py @@ -46,3 +46,18 @@ def test_ai_operators_set_true_shows_warning(): options.ai_operators = True assert options.ai_operators is True + + +def test_sql_compiler_default_stable(): + options = experiment_options.ExperimentOptions() + + assert options.sql_compiler == "stable" + + +def test_sql_compiler_set_experimental_shows_warning(): + options = experiment_options.ExperimentOptions() + + with pytest.warns(FutureWarning): + options.sql_compiler = 
"experimental" + + assert options.sql_compiler == "experimental" From fe5e711b4b7c9eca73907672b303bedf32660276 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:53:27 -0800 Subject: [PATCH 3/8] chore: remove redundant assertions (#2427) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/_operations/table.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bigframes/bigquery/_operations/table.py b/bigframes/bigquery/_operations/table.py index 51ff78d4c8..c90f88dcd6 100644 --- a/bigframes/bigquery/_operations/table.py +++ b/bigframes/bigquery/_operations/table.py @@ -16,7 +16,6 @@ from typing import Mapping, Optional, Union -import bigframes_vendored.constants import google.cloud.bigquery import pandas as pd @@ -94,9 +93,6 @@ def create_external_table( if session is None: bpd.read_gbq_query(sql) session = bpd.get_global_session() - assert ( - session is not None - ), f"Missing connection to BigQuery. Please report how you encountered this error at {bigframes_vendored.constants.FEEDBACK_LINK}." else: session.read_gbq_query(sql) From d123a36c69bdab68279a82a66afbd1c1fd38875a Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 2 Feb 2026 16:47:37 -0800 Subject: [PATCH 4/8] chore: attach type usage to job labels (#2407) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should complete the dev on the client side. 
Manually verified: https://screenshot.googleplex.com/jv3UfJCm75dCzwn Related bug 406578908 🦕 --- bigframes/core/compile/configs.py | 1 + .../compile/ibis_compiler/ibis_compiler.py | 12 +++++- bigframes/core/compile/sqlglot/compiler.py | 13 ++++-- bigframes/session/bq_caching_executor.py | 4 ++ .../small/session/test_session_logging.py | 40 +++++++++++++++++++ 5 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 tests/system/small/session/test_session_logging.py diff --git a/bigframes/core/compile/configs.py b/bigframes/core/compile/configs.py index 5ffca0cf43..62c28f87ca 100644 --- a/bigframes/core/compile/configs.py +++ b/bigframes/core/compile/configs.py @@ -34,3 +34,4 @@ class CompileResult: sql: str sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField] row_order: typing.Optional[ordering.RowOrdering] + encoded_type_refs: str diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index 31cd9a0456..9e209ea3b3 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -29,6 +29,7 @@ import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.configs as configs import bigframes.core.compile.explode +from bigframes.core.logging import data_types as data_type_logger import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites @@ -56,15 +57,20 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ) if request.sort_rows: result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = compile_result_node(result_node) return configs.CompileResult( - sql, result_node.schema.to_bigquery(), result_node.order_by + sql, + result_node.schema.to_bigquery(), + result_node.order_by, + encoded_type_refs, ) ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node)) result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = compile_result_node(result_node) # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: @@ -72,7 +78,9 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ordering if ordering.referenced_columns.issubset(result_node.ids) else None ) assert (not request.materialize_all_order_keys) or (output_order is not None) - return configs.CompileResult(sql, result_node.schema.to_bigquery(), output_order) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), output_order, encoded_type_refs + ) def _replace_unsupported_ops(node: nodes.BigFrameNode): diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 67997e16f7..f2c94f98c7 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -34,6 +34,7 @@ from bigframes.core.compile.sqlglot.expressions import typed_expr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir +from bigframes.core.logging import data_types as data_type_logger import bigframes.core.ordering as bf_ordering from bigframes.core.rewrite 
import schema_binding @@ -59,23 +60,29 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: ) if request.sort_rows: result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = _compile_result_node(result_node) return configs.CompileResult( - sql, result_node.schema.to_bigquery(), result_node.order_by + sql, + result_node.schema.to_bigquery(), + result_node.order_by, + encoded_type_refs, ) ordering: typing.Optional[bf_ordering.RowOrdering] = result_node.order_by result_node = dataclasses.replace(result_node, order_by=None) result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node)) + encoded_type_refs = data_type_logger.encode_type_refs(result_node) sql = _compile_result_node(result_node) - # Return the ordering iff no extra columns are needed to define the row order if ordering is not None: output_order = ( ordering if ordering.referenced_columns.issubset(result_node.ids) else None ) assert (not request.materialize_all_order_keys) or (output_order is not None) - return configs.CompileResult(sql, result_node.schema.to_bigquery(), output_order) + return configs.CompileResult( + sql, result_node.schema.to_bigquery(), output_order, encoded_type_refs + ) def _remap_variables( diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 57af7daf65..2f5ec035dc 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -322,6 +322,8 @@ def _export_gbq( clustering_fields=spec.cluster_cols if spec.cluster_cols else None, ) + # Attach data type usage to the job labels + job_config.labels["bigframes-dtypes"] = compiled.encoded_type_refs # TODO(swast): plumb through the api_name of the user-facing api that # caused this query. iterator, job = self._run_execute_query( @@ -665,6 +667,8 @@ def _execute_plan_gbq( ) job_config.destination = destination_table + # Attach data type usage to the job labels + job_config.labels["bigframes-dtypes"] = compiled.encoded_type_refs iterator, query_job = self._run_execute_query( sql=compiled.sql, job_config=job_config, diff --git a/tests/system/small/session/test_session_logging.py b/tests/system/small/session/test_session_logging.py new file mode 100644 index 0000000000..b951582309 --- /dev/null +++ b/tests/system/small/session/test_session_logging.py @@ -0,0 +1,40 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +from bigframes.core.logging import data_types +import bigframes.session._io.bigquery as bq_io + + +def test_data_type_logging(scalars_df_index): + s = scalars_df_index["int64_col"] + 1.5 + + # We want to check the job_config passed to _query_and_wait_bigframes + with mock.patch( + "bigframes.session._io.bigquery.start_query_with_client", + wraps=bq_io.start_query_with_client, + ) as mock_query: + s.to_pandas() + + # Fetch job labels sent to the BQ client and verify their values + assert mock_query.called + call_args = mock_query.call_args + job_config = call_args.kwargs.get("job_config") + assert job_config is not None + job_labels = job_config.labels + assert "bigframes-dtypes" in job_labels + assert job_labels["bigframes-dtypes"] == data_types.encode_type_refs( + s._block._expr.node + ) From f5fdb2a7d0c63d79c33323663c0959dc2a1097de Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 2 Feb 2026 17:24:15 -0800 Subject: [PATCH 5/8] feat: Suppress redundant "Completed" status in progress callback (#2419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR refines the visual feedback provided during operation progress. Specifically, it prevents the display of a standalone "✅ Completed." message and green checkmark when no query execution information (like slot time or bytes processed) is available. This often occurs during metadata-only operations or cached results where the "Completed" status is redundant. Fixes #<479944983> 🦕 --- bigframes/formatting_helpers.py | 10 +- notebooks/dataframes/anywidget_mode.ipynb | 345 +++++++++------------- 2 files changed, 142 insertions(+), 213 deletions(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 74d870baf7..1e3cdabdaf 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -200,10 +200,12 @@ def progress_callback( display_id=current_display_id, ) elif isinstance(event, bigframes.core.events.ExecutionFinished): - display.update_display( - display.HTML(f"✅ Completed. {previous_display_html}"), - display_id=current_display_id, - ) + if previous_display_html: + display.update_display( + display.HTML(f"✅ Completed. {previous_display_html}"), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.SessionClosed): display.update_display( display.HTML(f"Session {event.session_id} closed."), diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 5dd8af1c5f..e9491610ac 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -91,9 +91,7 @@ "outputs": [ { "data": { - "text/html": [ - "✅ Completed. 
" - ], + "text/html": [], "text/plain": [ "" ] @@ -119,17 +117,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -143,32 +141,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "220340b0", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "\n", - " Query started with request ID bigframes-dev:US.161c75bd-f9f8-4b21-8a45-1d7dfc659034.
SQL
SELECT\n",
-       "`state` AS `state`,\n",
-       "`gender` AS `gender`,\n",
-       "`year` AS `year`,\n",
-       "`name` AS `name`,\n",
-       "`number` AS `number`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t0`.`state`,\n",
-       "  `t0`.`gender`,\n",
-       "  `t0`.`year`,\n",
-       "  `t0`.`name`,\n",
-       "  `t0`.`number`,\n",
-       "  `t0`.`bfuid_col_2` AS `bfuid_col_15`\n",
-       "FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52._c58be946_1477_4c00_b699_0ae022f13563_bqdf_8e323719-899f-4da2-89cd-2dbb53ab1dfc` AS `t0`)\n",
-       "ORDER BY `bfuid_col_15` ASC NULLS LAST
\n", - " " - ], + "text/html": [], "text/plain": [ "" ] @@ -178,11 +157,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_IuiJsjhfPtOrKuTIOqPIjnVLX820 details]\n", - " " - ], + "text/html": [], "text/plain": [ "" ] @@ -193,7 +168,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e68fbb9eb4d24bab837c77730d31c8a1", + "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", "version_major": 2, "version_minor": 1 }, @@ -229,80 +204,80 @@ " AL\n", " F\n", " 1910\n", - " Hazel\n", - " 51\n", + " Lillian\n", + " 99\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Lucy\n", - " 76\n", + " Ruby\n", + " 204\n", " \n", " \n", " 2\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Nellie\n", - " 39\n", + " Helen\n", + " 76\n", " \n", " \n", " 3\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Lena\n", - " 40\n", + " Eunice\n", + " 41\n", " \n", " \n", " 4\n", - " CO\n", + " AR\n", " F\n", " 1910\n", - " Thelma\n", - " 36\n", + " Dora\n", + " 42\n", " \n", " \n", " 5\n", - " CO\n", + " CA\n", " F\n", " 1910\n", - " Ruth\n", - " 68\n", + " Edna\n", + " 62\n", " \n", " \n", " 6\n", - " CT\n", + " CA\n", " F\n", " 1910\n", - " Elizabeth\n", - " 86\n", + " Helen\n", + " 239\n", " \n", " \n", " 7\n", - " DC\n", + " CO\n", " F\n", " 1910\n", - " Mary\n", - " 80\n", + " Alice\n", + " 46\n", " \n", " \n", " 8\n", " FL\n", " F\n", " 1910\n", - " Annie\n", - " 101\n", + " Willie\n", + " 71\n", " \n", " \n", " 9\n", " FL\n", " F\n", " 1910\n", - " Alma\n", - " 39\n", + " Thelma\n", + " 65\n", " \n", " \n", "\n", @@ -310,67 +285,25 @@ "[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 9 seconds of slot time. [Job bigframes-dev:US.job_IEjIRaqt2w-_pAttPw1VAVuRPxA7 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 5 seconds of slot time. [Job bigframes-dev:US.job_Mi-3m2AkEC1iPgWi7hmcWa1M1oIA details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 6 seconds of slot time. [Job bigframes-dev:US.job_j8pvY385WwIY7tGvhI7Yxc62aBwd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" } ], "source": [ @@ -396,7 +329,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 30 seconds of slot time. 
[Job bigframes-dev:US.ff90d507-bec8-4d24-abc3-0209ac28e21f details]\n", + " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", " " ], "text/plain": [ @@ -422,9 +355,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -477,7 +408,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_517TdI--FMoURkV7QQNMltY_-dZ7 details]\n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", " " ], "text/plain": [ @@ -491,7 +422,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_rCeYkeBPqmTKNFWFgwXjz5Ed8uWI details]\n", + " Query processed 88.8 MB in 3 seconds of slot time. [Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", " " ], "text/plain": [ @@ -504,7 +435,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3e630b1a56c740e781772ca5f5c7267a", + "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", "version_major": 2, "version_minor": 1 }, @@ -606,7 +537,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 11 seconds of slot time. [Job bigframes-dev:US.job_XwXTDb6gWVkuyIFMeWA0waE33bSg details]\n", + " Query processed 215.9 MB in 10 seconds of slot time. [Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", " " ], "text/plain": [ @@ -620,7 +551,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 215.9 MB in 7 seconds of slot time. [Job bigframes-dev:US.job_bCW0LYK5_PzyyGPf9OAg4YfNMG1C details]\n", + " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", " " ], "text/plain": [ @@ -640,12 +571,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a6a2b19314b04283a5a66ca9d66eb771", + "model_id": "52d11291ba1d42e6b544acbd86eef6cf", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -755,12 +686,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "beb362548a6b4fd4a163569edd6f1a90", + "model_id": "32c61c84740d45a0ac37202a76c7c14e", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -804,7 +735,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 19 seconds of slot time.\n", + " Query processed 85.9 kB in 21 seconds of slot time.\n", " " ], "text/plain": [ @@ -826,9 +757,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -838,9 +767,7 @@ }, { "data": { - "text/html": [ - "✅ Completed. " - ], + "text/html": [], "text/plain": [ "" ] @@ -865,7 +792,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02a46cf499b442d4bfe03934195e67df", + "model_id": "9d60a47296214553bb10c434b5ee8330", "version_major": 2, "version_minor": 1 }, @@ -912,17 +839,17 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 03.10.2018\n", - " H01L 21/20\n", - " <NA>\n", - " 18166536.5\n", - " 16.02.2016\n", + " 29.08.018\n", + " E04H 6/12\n", " <NA>\n", - " Scheider, Sascha et al\n", - " EV Group E. 
Thallner GmbH\n", - " Kurz, Florian\n", - " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", - " EP 3 382 744 A1\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanw√§lte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " EP 3 366 869 A1\n", " \n", " \n", " 1\n", @@ -931,16 +858,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " A01K 31/00\n", + " H05B 6/12\n", " <NA>\n", - " 18171005.4\n", - " 05.02.2015\n", - " 05.02.2014\n", - " Stork Bamberger Patentanw√§lte\n", - " Linco Food Systems A/S\n", - " Thrane, Uffe\n", - " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", - " EP 3 381 276 A1\n", + " 18165514.3\n", + " 03.04.2018\n", + " 30.03.2017\n", + " <NA>\n", + " BSH Hausger√§te GmbH\n", + " Acero Acero, Jesus\n", + " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", + " EP 3 383 141 A2\n", " \n", " \n", " 2\n", @@ -949,16 +876,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " G06F 11/30\n", + " H01L 21/20\n", " <NA>\n", - " 18157347.8\n", - " 19.02.2018\n", - " 31.03.2017\n", - " Hoffmann Eitle\n", - " FUJITSU LIMITED\n", - " Kukihara, Kensuke\n", - " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", - " EP 3 382 553 A1\n", + " 18166536.5\n", + " 16.02.2016\n", + " <NA>\n", + " Scheider, Sascha et al\n", + " EV Group E. Thallner GmbH\n", + " Kurz, Florian\n", + " VORRICHTUNG ZUM BONDEN VON SUBSTRATEN\n", + " EP 3 382 744 A1\n", " \n", " \n", " 3\n", @@ -967,16 +894,16 @@ " EU\n", " DE\n", " 03.10.2018\n", - " H05B 6/12\n", - " <NA>\n", - " 18165514.3\n", - " 03.04.2018\n", - " 30.03.2017\n", + " G06F 11/30\n", " <NA>\n", - " BSH Hausger√§te GmbH\n", - " Acero Acero, Jesus\n", - " VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG\n", - " EP 3 383 141 A2\n", + " 18157347.8\n", + " 19.02.2018\n", + " 31.03.2017\n", + " Hoffmann Eitle\n", + " FUJITSU LIMITED\n", + " Kukihara, Kensuke\n", + " METHOD EXECUTED BY A COMPUTER, INFORMATION PRO...\n", + " EP 3 382 553 A1\n", " \n", " \n", " 4\n", @@ -984,17 +911,17 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 29.08.018\n", - " E04H 6/12\n", + " 03.10.2018\n", + " A01K 31/00\n", " <NA>\n", - " 18157874.1\n", - " 21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", - " EP 3 366 869 A1\n", + " 18171005.4\n", + " 05.02.2015\n", + " 05.02.2014\n", + " Stork Bamberger Patentanw√§lte\n", + " Linco Food Systems A/S\n", + " Thrane, Uffe\n", + " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", + " EP 3 381 276 A1\n", " \n", " \n", "\n", @@ -1017,32 +944,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... 
EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 H01L 21/20 18166536.5 \n", - "1 03.10.2018 A01K 31/00 18171005.4 \n", - "2 03.10.2018 G06F 11/30 18157347.8 \n", - "3 03.10.2018 H05B 6/12 18165514.3 \n", - "4 29.08.018 E04H 6/12 18157874.1 \n", + "0 29.08.018 E04H 6/12 18157874.1 \n", + "1 03.10.2018 H05B 6/12 18165514.3 \n", + "2 03.10.2018 H01L 21/20 18166536.5 \n", + "3 03.10.2018 G06F 11/30 18157347.8 \n", + "4 03.10.2018 A01K 31/00 18171005.4 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 16.02.2016 Scheider, Sascha et al \n", - "1 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", - "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 03.04.2018 30.03.2017 \n", - "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "1 03.04.2018 30.03.2017 \n", + "2 16.02.2016 Scheider, Sascha et al \n", + "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 EV Group E. Thallner GmbH Kurz, Florian \n", - "1 Linco Food Systems A/S Thrane, Uffe \n", - "2 FUJITSU LIMITED Kukihara, Kensuke \n", - "3 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "1 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "2 EV Group E. Thallner GmbH Kurz, Florian \n", + "3 FUJITSU LIMITED Kukihara, Kensuke \n", + "4 Linco Food Systems A/S Thrane, Uffe \n", "\n", " title_line_1 number \n", - "0 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "1 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "3 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "4 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] From 326e78bf1554959bd4a62a7ee6b41117c5d988fe Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 3 Feb 2026 10:13:56 -0800 Subject: [PATCH 6/8] chore: librarian release pull request:20260202T232430Z (#2428) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.7.0 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677
bigframes: 2.34.0 ## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02) ### Features * add `bigframes.pandas.options.experiments.sql_compiler` for switching the backend compiler (#2417) ([7eba6ee03f07938315d99e2aeaf72368c02074cf](https://github.com/googleapis/python-bigquery-dataframes/commit/7eba6ee03f07938315d99e2aeaf72368c02074cf)) * add bigquery.ml.generate_embedding function (#2422) ([35f3f5e6f8c64b47e6e7214034f96f047785e647](https://github.com/googleapis/python-bigquery-dataframes/commit/35f3f5e6f8c64b47e6e7214034f96f047785e647)) * add bigquery.create_external_table method (#2415) ([76db2956e505aec4f1055118ac7ca523facc10ff](https://github.com/googleapis/python-bigquery-dataframes/commit/76db2956e505aec4f1055118ac7ca523facc10ff)) * add deprecation warnings for .blob accessor and read_gbq_object_table (#2408) ([7261a4ea5cdab6b30f5bc333501648c60e70be59](https://github.com/googleapis/python-bigquery-dataframes/commit/7261a4ea5cdab6b30f5bc333501648c60e70be59)) * add bigquery.ml.generate_text function (#2403) ([5ac681028624de15e31f0c2ae360b47b2dcf1e8d](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac681028624de15e31f0c2ae360b47b2dcf1e8d)) ### Bug Fixes * broken job url (#2411) ([fcb5bc1761c656e1aec61dbcf96a36d436833b7a](https://github.com/googleapis/python-bigquery-dataframes/commit/fcb5bc1761c656e1aec61dbcf96a36d436833b7a)) --- .librarian/state.yaml | 2 +- CHANGELOG.md | 16 ++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 4dba64808e..21903a5124 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677 libraries: - id: bigframes - version: 2.33.0 + version: 2.34.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index 090cf2ee57..f54231f540 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02) + + +### Features + +* add `bigframes.pandas.options.experiments.sql_compiler` for switching the backend compiler (#2417) ([7eba6ee03f07938315d99e2aeaf72368c02074cf](https://github.com/googleapis/python-bigquery-dataframes/commit/7eba6ee03f07938315d99e2aeaf72368c02074cf)) +* add bigquery.ml.generate_embedding function (#2422) ([35f3f5e6f8c64b47e6e7214034f96f047785e647](https://github.com/googleapis/python-bigquery-dataframes/commit/35f3f5e6f8c64b47e6e7214034f96f047785e647)) +* add bigquery.create_external_table method (#2415) ([76db2956e505aec4f1055118ac7ca523facc10ff](https://github.com/googleapis/python-bigquery-dataframes/commit/76db2956e505aec4f1055118ac7ca523facc10ff)) +* add deprecation warnings for .blob accessor and read_gbq_object_table (#2408) ([7261a4ea5cdab6b30f5bc333501648c60e70be59](https://github.com/googleapis/python-bigquery-dataframes/commit/7261a4ea5cdab6b30f5bc333501648c60e70be59)) +* add bigquery.ml.generate_text function (#2403) ([5ac681028624de15e31f0c2ae360b47b2dcf1e8d](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac681028624de15e31f0c2ae360b47b2dcf1e8d)) + + +### Bug Fixes + +* broken job url (#2411) 
([fcb5bc1761c656e1aec61dbcf96a36d436833b7a](https://github.com/googleapis/python-bigquery-dataframes/commit/fcb5bc1761c656e1aec61dbcf96a36d436833b7a)) + ## [2.33.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.32.0...v2.33.0) (2026-01-22) diff --git a/bigframes/version.py b/bigframes/version.py index 1e9ed79f82..a6862ee201 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.33.0" +__version__ = "2.34.0" # {x-release-please-start-date} -__release_date__ = "2026-01-22" +__release_date__ = "2026-02-02" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 1e9ed79f82..a6862ee201 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.33.0" +__version__ = "2.34.0" # {x-release-please-start-date} -__release_date__ = "2026-01-22" +__release_date__ = "2026-02-02" # {x-release-please-end} From 0c705f5151fbc69c89c932bb108ea20d163361d5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 4 Feb 2026 18:37:55 +0000 Subject: [PATCH 7/8] feat: Deprecate blob.exif and replace with sample notebook --- bigframes/blob/_functions.py | 52 +-- bigframes/operations/blob.py | 70 ---- .../multimodal/image_metadata_exif.ipynb | 319 ++++++++++++++++++ tests/system/large/blob/test_function.py | 51 --- 4 files changed, 320 insertions(+), 172 deletions(-) create mode 100644 notebooks/multimodal/image_metadata_exif.ipynb diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 3dfe38811b..f8a79ca9b0 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -126,57 +126,7 @@ def udf(self): return self._session.read_gbq_function(udf_name) -def exif_func(src_obj_ref_rt: str, verbose: bool) -> str: - try: - import io - import json - - from PIL import ExifTags, Image - import requests - from requests import adapters - - session = requests.Session() - session.mount("https://", adapters.HTTPAdapter(max_retries=3)) - - src_obj_ref_rt_json = json.loads(src_obj_ref_rt) - src_url = src_obj_ref_rt_json["access_urls"]["read_url"] - - response = session.get(src_url, timeout=30) - response.raise_for_status() - bts = response.content - - image = Image.open(io.BytesIO(bts)) - exif_data = image.getexif() - exif_dict = {} - - if exif_data: - for tag, value in exif_data.items(): - tag_name = ExifTags.TAGS.get(tag, tag) - # Convert non-serializable types to strings - try: - json.dumps(value) - exif_dict[tag_name] = value - except (TypeError, ValueError): - exif_dict[tag_name] = str(value) - - if verbose: - return json.dumps({"status": "", "content": json.dumps(exif_dict)}) - else: - return json.dumps(exif_dict) - - except Exception as e: - # Return error as JSON with error field - error_result = {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"} - if verbose: - return json.dumps(error_result) - else: - return "{}" - - -exif_func_def = FunctionDef(exif_func, ["pillow", "requests"]) - - -# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string. +# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string. 
│ def image_blur_func( src_obj_ref_rt: str, dst_obj_ref_rt: str, diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 9210addaa8..edb4ebf904 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -336,76 +336,6 @@ def get_runtime_json_str( runtime = self._get_runtime(mode=mode, with_metadata=with_metadata) return runtime._apply_unary_op(ops.ToJSONString()) - def exif( - self, - *, - engine: Literal[None, "pillow"] = None, - connection: Optional[str] = None, - max_batching_rows: int = 8192, - container_cpu: Union[float, int] = 0.33, - container_memory: str = "512Mi", - verbose: bool = False, - ) -> bigframes.series.Series: - """Extract EXIF data. Now only support image types. - - Args: - engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. - connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. - max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. - container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. - container_memory (str, default "512Mi"): container memory size. String of the format . Possible values are from 512Mi to 32Gi. - verbose (bool, default False): If True, returns a struct with status and content fields. If False, returns only the content. - - Returns: - bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True. - - Raises: - ValueError: If engine is not 'pillow'. - RuntimeError: If EXIF extraction fails or returns invalid structure. 
- """ - if engine is None or engine.casefold() != "pillow": - raise ValueError("Must specify the engine, supported value is 'pillow'.") - - import bigframes.bigquery as bbq - import bigframes.blob._functions as blob_func - import bigframes.pandas as bpd - - connection = self._resolve_connection(connection) - df = self.get_runtime_json_str(mode="R").to_frame() - df["verbose"] = verbose - - exif_udf = blob_func.TransformFunction( - blob_func.exif_func_def, - session=self._data._block.session, - connection=connection, - max_batching_rows=max_batching_rows, - container_cpu=container_cpu, - container_memory=container_memory, - ).udf() - - res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction") - - if verbose: - try: - exif_content_series = bbq.parse_json( - res._apply_unary_op(ops.JSONValue(json_path="$.content")) - ).rename("exif_content") - exif_status_series = res._apply_unary_op( - ops.JSONValue(json_path="$.status") - ) - except Exception as e: - raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e - results_df = bpd.DataFrame( - {"status": exif_status_series, "content": exif_content_series} - ) - results_struct = bbq.struct(results_df).rename("exif_results") - return results_struct - else: - try: - return bbq.parse_json(res) - except Exception as e: - raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e - def image_blur( self, ksize: tuple[int, int], diff --git a/notebooks/multimodal/image_metadata_exif.ipynb b/notebooks/multimodal/image_metadata_exif.ipynb new file mode 100644 index 0000000000..41dea4b6ba --- /dev/null +++ b/notebooks/multimodal/image_metadata_exif.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Image Metadata (EXIF) Extraction with BigFrames\n", + "\n", + "This notebook demonstrates how to extract EXIF metadata from images stored in Google Cloud Storage using BigQuery DataFrames (BigFrames) user-defined functions (UDFs).\n", + "\n", + "## Setup\n", + "\n", + "Please provide your project ID and location. The notebook uses the default BigFrames connection and a sample dataset name by default." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "\n", + "# @title Configuration\n", + "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", + "LOCATION = \"us\" # @param {type:\"string\"}\n", + "\n", + "# Dataset where the UDF will be created.\n", + "DATASET_ID = \"bigframes_samples\" # @param {type:\"string\"}\n", + "\n", + "# A BigQuery connection is required for the UDF to access Google Cloud Storage.\n", + "# \"bigframes-default-connection\" is the default connection created by BigFrames.\n", + "CONNECTION_ID = \"bigframes-default-connection\" # @param {type:\"string\"}\n", + "\n", + "# Construct the canonical connection ID\n", + "FULL_CONNECTION_ID = f\"{PROJECT_ID}.{LOCATION}.{CONNECTION_ID}\"\n", + "\n", + "# Initialize BigFrames\n", + "bpd.options.bigquery.project = PROJECT_ID\n", + "bpd.options.bigquery.location = LOCATION" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the EXIF Extraction UDF\n", + "\n", + "We will define a BigQuery remote UDF that takes a BigQuery `ObjectRef` runtime JSON string, downloads the image, and extracts EXIF data using the `Pillow` library.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:150: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time. [Job bigframes-dev:US.8160cd8b-7a06-4eb6-9cb7-4b5cc72e96b9 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "@bpd.udf(\n", + " input_types=[str],\n", + " output_type=str,\n", + " dataset=DATASET_ID,\n", + " name=\"extract_exif\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"pillow\", \"requests\"],\n", + " max_batching_rows=8192,\n", + " container_cpu=0.33,\n", + " container_memory=\"512Mi\"\n", + ")\n", + "def extract_exif(src_obj_ref_rt: str) -> str:\n", + " import io\n", + " import json\n", + " from PIL import ExifTags, Image\n", + " import requests\n", + " from requests import adapters\n", + " session = requests.Session()\n", + " session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n", + " src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n", + " src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n", + " response = session.get(src_url, timeout=30)\n", + " bts = response.content\n", + " image = Image.open(io.BytesIO(bts))\n", + " exif_data = image.getexif()\n", + " exif_dict = {}\n", + " if exif_data:\n", + " for tag, value in exif_data.items():\n", + " tag_name = ExifTags.TAGS.get(tag, tag)\n", + " exif_dict[tag_name] = value\n", + " return json.dumps(exif_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extract EXIF from Images\n", + "\n", + "Now we can use this function on a BigFrames Series of image URIs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/utils.py:228: PreviewWarning: The JSON-related API `parse_json` is in preview. Its behavior may\n", + "change in future versions.\n", + " warnings.warn(bfe.format_message(msg), category=bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query started. [Job bigframes-dev:US.071a66b1-7952-4126-a750-4febe1277721 details]
SQL
SELECT\n",
+       "`bfuid_col_107` AS `bfuid_col_107`,\n",
+       "`bfuid_col_113` AS `bfuid_col_113`,\n",
+       "`bfuid_col_114` AS `bfuid_col_114`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  `t2`.`uri`,\n",
+       "  `t2`.`generation`,\n",
+       "  `t2`.`content_type`,\n",
+       "  `t2`.`size`,\n",
+       "  `t2`.`md5_hash`,\n",
+       "  `t2`.`updated`,\n",
+       "  `t2`.`metadata`,\n",
+       "  `t2`.`ref`,\n",
+       "  `t2`.`bfuid_col_115`,\n",
+       "  `t2`.`bfuid_col_107`,\n",
+       "  `t2`.`bfuid_col_108`,\n",
+       "  `t2`.`bfuid_col_109`,\n",
+       "  `t2`.`bfuid_col_110`,\n",
+       "  `t2`.`bfuid_col_111`,\n",
+       "  `t2`.`bfuid_col_112`,\n",
+       "  PARSE_JSON(`t2`.`bfuid_col_112`) AS `bfuid_col_113`,\n",
+       "  `t2`.`bfuid_col_107` AS `bfuid_col_114`\n",
+       "FROM (\n",
+       "  SELECT\n",
+       "    `t1`.`uri`,\n",
+       "    `t1`.`generation`,\n",
+       "    `t1`.`content_type`,\n",
+       "    `t1`.`size`,\n",
+       "    `t1`.`md5_hash`,\n",
+       "    `t1`.`updated`,\n",
+       "    `t1`.`metadata`,\n",
+       "    `t1`.`ref`,\n",
+       "    CONCAT(\n",
+       "      CAST(FARM_FINGERPRINT(\n",
+       "        CONCAT(\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
+       "          CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
+       "        )\n",
+       "      ) AS STRING),\n",
+       "      CAST(FARM_FINGERPRINT(\n",
+       "        CONCAT(\n",
+       "          CONCAT(\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
+       "          ),\n",
+       "          '_'\n",
+       "        )\n",
+       "      ) AS STRING),\n",
+       "      CAST(RAND() AS STRING)\n",
+       "    ) AS `bfuid_col_115`,\n",
+       "    ROW_NUMBER() OVER (\n",
+       "      ORDER BY CONCAT(\n",
+       "        CAST(FARM_FINGERPRINT(\n",
+       "          CONCAT(\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
+       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
+       "          )\n",
+       "        ) AS STRING),\n",
+       "        CAST(FARM_FINGERPRINT(\n",
+       "          CONCAT(\n",
+       "            CONCAT(\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
+       "              CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
+       "            ),\n",
+       "            '_'\n",
+       "          )\n",
+       "        ) AS STRING),\n",
+       "        CAST(RAND() AS STRING)\n",
+       "      ) ASC\n",
+       "    ) - 1 AS `bfuid_col_107`,\n",
+       "    `OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection') AS `bfuid_col_108`,\n",
+       "    `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R') AS `bfuid_col_109`,\n",
+       "    to_json_string(\n",
+       "      `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
+       "    ) AS `bfuid_col_110`,\n",
+       "    to_json_string(\n",
+       "      `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
+       "    ) AS `bfuid_col_111`,\n",
+       "    `bigframes-dev.bigframes_samples.extract_exif`(\n",
+       "      to_json_string(\n",
+       "        `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
+       "      )\n",
+       "    ) AS `bfuid_col_112`\n",
+       "  FROM (\n",
+       "    SELECT\n",
+       "      *\n",
+       "    FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52.bqdf20260204_session9a8d0a_8b743a91e1ad4be5bc629e595991d905` AS `t0`\n",
+       "  ) AS `t1`\n",
+       ") AS `t2`)
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create a Multimodal DataFrame from the sample image URIs\n", + "exif_image_df = bpd.from_glob_path(\n", + " \"gs://bigframes_blob_test/images_exif/*\",\n", + " name=\"blob_col\",\n", + ")\n", + "\n", + "# Generate a JSON string containing the runtime information (including signed read URLs)\n", + "# This allows the UDF to download the images from Google Cloud Storage\n", + "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n", + "\n", + "# Apply the BigQuery Python UDF to the runtime JSON strings\n", + "# We cast to string to ensure the input matches the UDF's signature\n", + "exif_json = access_urls.astype(str).apply(extract_exif)\n", + "\n", + "# Parse the resulting JSON strings back into a structured JSON type for easier access\n", + "actual = bbq.parse_json(exif_json)\n", + "\n", + "actual" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 7963fabd0b..94b3ce2ca6 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -52,57 +52,6 @@ def images_output_uris(images_output_folder: str) -> list[str]: ] -def test_blob_exif( - bq_connection: str, - session: bigframes.Session, -): - exif_image_df = session.from_glob_path( - "gs://bigframes_blob_test/images_exif/*", - name="blob_col", - connection=bq_connection, - ) - - actual = exif_image_df["blob_col"].blob.exif( - engine="pillow", connection=bq_connection, verbose=False - ) - expected = bpd.Series( - ['{"ExifOffset": 47, "Make": "MyCamera"}'], - session=session, - dtype=dtypes.JSON_DTYPE, - ) - pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), - check_dtype=False, - check_index_type=False, - ) - - -def test_blob_exif_verbose( - bq_connection: str, - session: bigframes.Session, -): - exif_image_df = session.from_glob_path( - "gs://bigframes_blob_test/images_exif/*", - name="blob_col", - connection=bq_connection, - ) - - actual = exif_image_df["blob_col"].blob.exif( - engine="pillow", connection=bq_connection, verbose=True - ) - assert hasattr(actual, "struct") - actual_exploded = actual.struct.explode() - assert "status" in actual_exploded.columns - assert "content" in actual_exploded.columns - - status_series = actual_exploded["status"] - assert status_series.dtype == dtypes.STRING_DTYPE - - content_series = actual_exploded["content"] - assert content_series.dtype == dtypes.JSON_DTYPE - - def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str, From ecc270c383e53ad89dd08a44d3b1df8f35d19383 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 4 Feb 2026 22:57:28 +0000 Subject: [PATCH 8/8] docs: update notebook --- .../multimodal/image_metadata_exif.ipynb | 319 ------------------ .../multimodal/multimodal_dataframe.ipynb | 94 +++++- 2 files changed, 92 insertions(+), 321 deletions(-) delete mode 100644 notebooks/multimodal/image_metadata_exif.ipynb diff --git a/notebooks/multimodal/image_metadata_exif.ipynb 
deleted file mode 100644
index 41dea4b6ba..0000000000
--- a/notebooks/multimodal/image_metadata_exif.ipynb
+++ /dev/null
@@ -1,319 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Image Metadata (EXIF) Extraction with BigFrames\n",
-    "\n",
-    "This notebook demonstrates how to extract EXIF metadata from images stored in Google Cloud Storage using BigQuery DataFrames (BigFrames) user-defined functions (UDFs).\n",
-    "\n",
-    "## Setup\n",
-    "\n",
-    "Please provide your project ID and location. The notebook uses the default BigFrames connection and a sample dataset name by default."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import bigframes.pandas as bpd\n",
-    "import bigframes.bigquery as bbq\n",
-    "\n",
-    "# @title Configuration\n",
-    "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n",
-    "LOCATION = \"us\" # @param {type:\"string\"}\n",
-    "\n",
-    "# Dataset where the UDF will be created.\n",
-    "DATASET_ID = \"bigframes_samples\" # @param {type:\"string\"}\n",
-    "\n",
-    "# A BigQuery connection is required for the UDF to access Google Cloud Storage.\n",
-    "# \"bigframes-default-connection\" is the default connection created by BigFrames.\n",
-    "CONNECTION_ID = \"bigframes-default-connection\" # @param {type:\"string\"}\n",
-    "\n",
-    "# Construct the canonical connection ID\n",
-    "FULL_CONNECTION_ID = f\"{PROJECT_ID}.{LOCATION}.{CONNECTION_ID}\"\n",
-    "\n",
-    "# Initialize BigFrames\n",
-    "bpd.options.bigquery.project = PROJECT_ID\n",
-    "bpd.options.bigquery.location = LOCATION"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Define the EXIF Extraction UDF\n",
-    "\n",
-    "We will define a BigQuery remote UDF that takes a BigQuery `ObjectRef` runtime JSON string, downloads the image, and extracts EXIF data using the `Pillow` library.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:150: PreviewWarning: udf is in preview.\n",
-      "  return global_session.with_default_session(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    Query processed 0 Bytes in a moment of slot time. [Job bigframes-dev:US.8160cd8b-7a06-4eb6-9cb7-4b5cc72e96b9 details]\n",
-       "    "
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "@bpd.udf(\n",
-    "    input_types=[str],\n",
-    "    output_type=str,\n",
-    "    dataset=DATASET_ID,\n",
-    "    name=\"extract_exif\",\n",
-    "    bigquery_connection=FULL_CONNECTION_ID,\n",
-    "    packages=[\"pillow\", \"requests\"],\n",
-    "    max_batching_rows=8192,\n",
-    "    container_cpu=0.33,\n",
-    "    container_memory=\"512Mi\"\n",
-    ")\n",
-    "def extract_exif(src_obj_ref_rt: str) -> str:\n",
-    "    import io\n",
-    "    import json\n",
-    "    from PIL import ExifTags, Image\n",
-    "    import requests\n",
-    "    from requests import adapters\n",
-    "    session = requests.Session()\n",
-    "    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
-    "    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
-    "    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
-    "    response = session.get(src_url, timeout=30)\n",
-    "    bts = response.content\n",
-    "    image = Image.open(io.BytesIO(bts))\n",
-    "    exif_data = image.getexif()\n",
-    "    exif_dict = {}\n",
-    "    if exif_data:\n",
-    "        for tag, value in exif_data.items():\n",
-    "            tag_name = ExifTags.TAGS.get(tag, tag)\n",
-    "            exif_dict[tag_name] = value\n",
-    "    return json.dumps(exif_dict)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Extract EXIF from Images\n",
-    "\n",
-    "Now we can use this function on a BigFrames Series of image URIs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-      "instead of using `db_dtypes` in the future when available in pandas\n",
-      "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-      "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
-      "instead of using `db_dtypes` in the future when available in pandas\n",
-      "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
-      "  warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n",
-      "  return prop(*args, **kwargs)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/utils.py:228: PreviewWarning: The JSON-related API `parse_json` is in preview. Its behavior may\n",
-      "change in future versions.\n",
-      "  warnings.warn(bfe.format_message(msg), category=bfe.PreviewWarning)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    Query started. [Job bigframes-dev:US.071a66b1-7952-4126-a750-4febe1277721 details]\n",
-       "    SQL\n",
SELECT\n",
-       "`bfuid_col_107` AS `bfuid_col_107`,\n",
-       "`bfuid_col_113` AS `bfuid_col_113`,\n",
-       "`bfuid_col_114` AS `bfuid_col_114`\n",
-       "FROM\n",
-       "(SELECT\n",
-       "  `t2`.`uri`,\n",
-       "  `t2`.`generation`,\n",
-       "  `t2`.`content_type`,\n",
-       "  `t2`.`size`,\n",
-       "  `t2`.`md5_hash`,\n",
-       "  `t2`.`updated`,\n",
-       "  `t2`.`metadata`,\n",
-       "  `t2`.`ref`,\n",
-       "  `t2`.`bfuid_col_115`,\n",
-       "  `t2`.`bfuid_col_107`,\n",
-       "  `t2`.`bfuid_col_108`,\n",
-       "  `t2`.`bfuid_col_109`,\n",
-       "  `t2`.`bfuid_col_110`,\n",
-       "  `t2`.`bfuid_col_111`,\n",
-       "  `t2`.`bfuid_col_112`,\n",
-       "  PARSE_JSON(`t2`.`bfuid_col_112`) AS `bfuid_col_113`,\n",
-       "  `t2`.`bfuid_col_107` AS `bfuid_col_114`\n",
-       "FROM (\n",
-       "  SELECT\n",
-       "    `t1`.`uri`,\n",
-       "    `t1`.`generation`,\n",
-       "    `t1`.`content_type`,\n",
-       "    `t1`.`size`,\n",
-       "    `t1`.`md5_hash`,\n",
-       "    `t1`.`updated`,\n",
-       "    `t1`.`metadata`,\n",
-       "    `t1`.`ref`,\n",
-       "    CONCAT(\n",
-       "      CAST(FARM_FINGERPRINT(\n",
-       "        CONCAT(\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
-       "          CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
-       "        )\n",
-       "      ) AS STRING),\n",
-       "      CAST(FARM_FINGERPRINT(\n",
-       "        CONCAT(\n",
-       "          CONCAT(\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
-       "          ),\n",
-       "          '_'\n",
-       "        )\n",
-       "      ) AS STRING),\n",
-       "      CAST(RAND() AS STRING)\n",
-       "    ) AS `bfuid_col_115`,\n",
-       "    ROW_NUMBER() OVER (\n",
-       "      ORDER BY CONCAT(\n",
-       "        CAST(FARM_FINGERPRINT(\n",
-       "          CONCAT(\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
-       "            CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
-       "          )\n",
-       "        ) AS STRING),\n",
-       "        CAST(FARM_FINGERPRINT(\n",
-       "          CONCAT(\n",
-       "            CONCAT(\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`uri`, ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`generation` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`content_type`, ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`size` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(`t1`.`md5_hash`, ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(CAST(`t1`.`updated` AS STRING), ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`metadata`), ''), '\\\\', '\\\\\\\\')),\n",
-       "              CONCAT('\\\\', REPLACE(COALESCE(to_json_string(`t1`.`ref`), ''), '\\\\', '\\\\\\\\'))\n",
-       "            ),\n",
-       "            '_'\n",
-       "          )\n",
-       "        ) AS STRING),\n",
-       "        CAST(RAND() AS STRING)\n",
-       "      ) ASC\n",
-       "    ) - 1 AS `bfuid_col_107`,\n",
-       "    `OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection') AS `bfuid_col_108`,\n",
-       "    `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R') AS `bfuid_col_109`,\n",
-       "    to_json_string(\n",
-       "      `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
-       "    ) AS `bfuid_col_110`,\n",
-       "    to_json_string(\n",
-       "      `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
-       "    ) AS `bfuid_col_111`,\n",
-       "    `bigframes-dev.bigframes_samples.extract_exif`(\n",
-       "      to_json_string(\n",
-       "        `OBJ.GET_ACCESS_URL`(`OBJ.MAKE_REF`(`t1`.`uri`, 'bigframes-dev.US.bigframes-default-connection'), 'R')\n",
-       "      )\n",
-       "    ) AS `bfuid_col_112`\n",
-       "  FROM (\n",
-       "    SELECT\n",
-       "      *\n",
-       "    FROM `bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52.bqdf20260204_session9a8d0a_8b743a91e1ad4be5bc629e595991d905` AS `t0`\n",
-       "  ) AS `t1`\n",
-       ") AS `t2`)
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Create a Multimodal DataFrame from the sample image URIs\n", - "exif_image_df = bpd.from_glob_path(\n", - " \"gs://bigframes_blob_test/images_exif/*\",\n", - " name=\"blob_col\",\n", - ")\n", - "\n", - "# Generate a JSON string containing the runtime information (including signed read URLs)\n", - "# This allows the UDF to download the images from Google Cloud Storage\n", - "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n", - "\n", - "# Apply the BigQuery Python UDF to the runtime JSON strings\n", - "# We cast to string to ensure the input matches the UDF's signature\n", - "exif_json = access_urls.astype(str).apply(extract_exif)\n", - "\n", - "# Parse the resulting JSON strings back into a structured JSON type for easier access\n", - "actual = bbq.parse_json(exif_json)\n", - "\n", - "actual" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index 0822ee4c2d..bb5733bde2 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -61,7 +61,8 @@ "3. Conduct image transformations\n", "4. Use LLM models to ask questions and generate embeddings on images\n", "5. PDF chunking function\n", - "6. Transcribe audio" + "6. Transcribe audio\n", + "7. Extract EXIF metadata from images" ] }, { @@ -104,6 +105,11 @@ "PROJECT = \"bigframes-dev\" # replace with your project. \n", "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", "\n", + "LOCATION = \"us\" # replace with your location.\n", + "\n", + "# Dataset where the UDF will be created.\n", + "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n", + "\n", "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", @@ -112,12 +118,14 @@ "import bigframes\n", "# Setup project\n", "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.bigquery.location = LOCATION\n", "\n", "# Display options\n", "bigframes.options.display.blob_display_width = 300\n", "bigframes.options.display.progress_bar = None\n", "\n", - "import bigframes.pandas as bpd" + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq" ] }, { @@ -1546,6 +1554,88 @@ "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", "transcribed_series_verbose" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. 
+    "### 7. Extract EXIF metadata from images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct the canonical connection ID\n",
+    "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+    "\n",
+    "@bpd.udf(\n",
+    "    input_types=[str],\n",
+    "    output_type=str,\n",
+    "    dataset=DATASET_ID,\n",
+    "    name=\"extract_exif\",\n",
+    "    bigquery_connection=FULL_CONNECTION_ID,\n",
+    "    packages=[\"pillow\", \"requests\"],\n",
+    "    max_batching_rows=8192,\n",
+    "    container_cpu=0.33,\n",
+    "    container_memory=\"512Mi\"\n",
+    ")\n",
+    "def extract_exif(src_obj_ref_rt: str) -> str:\n",
+    "    import io\n",
+    "    import json\n",
+    "    from PIL import ExifTags, Image\n",
+    "    import requests\n",
+    "    from requests import adapters\n",
+    "    session = requests.Session()\n",
+    "    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+    "    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+    "    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+    "    response = session.get(src_url, timeout=30)\n",
+    "    bts = response.content\n",
+    "    image = Image.open(io.BytesIO(bts))\n",
+    "    exif_data = image.getexif()\n",
+    "    exif_dict = {}\n",
+    "    if exif_data:\n",
+    "        for tag, value in exif_data.items():\n",
+    "            tag_name = ExifTags.TAGS.get(tag, tag)\n",
+    "            exif_dict[tag_name] = value\n",
+    "    return json.dumps(exif_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a Multimodal DataFrame from the sample image URIs\n",
+    "exif_image_df = bpd.from_glob_path(\n",
+    "    \"gs://bigframes_blob_test/images_exif/*\",\n",
+    "    name=\"blob_col\",\n",
+    ")\n",
+    "\n",
+    "# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+    "# This allows the UDF to download the images from Google Cloud Storage\n",
+    "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
+    "\n",
+    "# Apply the BigQuery Python UDF to the runtime JSON strings\n",
+    "# We cast to string to ensure the input matches the UDF's signature\n",
+    "exif_json = access_urls.astype(str).apply(extract_exif)\n",
+    "\n",
+    "# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
+    "exif_data = bbq.parse_json(exif_json)\n",
+    "\n",
+    "exif_data"
+   ]
+  }
  ],
  "metadata": {