diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 1f0ca592e5..a9fd492f98 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,6 +16,7 @@ import dataclasses import functools import itertools +import json from typing import cast, Literal, Optional, Sequence, Tuple, Type, TYPE_CHECKING import pandas as pd @@ -429,7 +430,68 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.JSONDecode) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) - return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) + target_dtype = _bigframes_dtype_to_polars_dtype(op.to_type) + if op.safe: + # Polars does not support safe JSON decoding (returning null on failure). + # We use map_elements to provide safe JSON decoding. + def safe_decode(val): + if val is None: + return None + try: + decoded = json.loads(val) + except Exception: + return None + + if decoded is None: + return None + + if op.to_type == bigframes.dtypes.INT_DTYPE: + if type(decoded) is bool: + return None + if isinstance(decoded, int): + return decoded + if isinstance(decoded, float): + if decoded.is_integer(): + return int(decoded) + if isinstance(decoded, str): + try: + return int(decoded) + except Exception: + pass + return None + + if op.to_type == bigframes.dtypes.FLOAT_DTYPE: + if type(decoded) is bool: + return None + if isinstance(decoded, (int, float)): + return float(decoded) + if isinstance(decoded, str): + try: + return float(decoded) + except Exception: + pass + return None + + if op.to_type == bigframes.dtypes.BOOL_DTYPE: + if isinstance(decoded, bool): + return decoded + if isinstance(decoded, str): + if decoded.lower() == "true": + return True + if decoded.lower() == "false": + return False + return None + + if op.to_type == bigframes.dtypes.STRING_DTYPE: + if isinstance(decoded, str): + return decoded + return 
None + + return decoded + + return input.map_elements(safe_decode, return_dtype=target_dtype) + + return input.str.json_decode(target_dtype) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index bf617d6879..5f80904b3b 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -391,7 +391,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): return arg if arg.output_type == dtypes.JSON_DTYPE: - return json_ops.JSONDecode(cast_op.to_type).as_expr(arg) + return json_ops.JSONDecode(cast_op.to_type, safe=cast_op.safe).as_expr(arg) if ( arg.output_type == dtypes.STRING_DTYPE and cast_op.to_type == dtypes.DATETIME_DTYPE diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index be0d2b45d0..9bd20fce43 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -133,25 +133,26 @@ def _initial_load(self) -> None: # obtain the row counts # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` # before we get here so that the count might already be cached. - self._reset_batches_for_new_page_size() + with bigframes.option_context("display.progress_bar", None): + self._reset_batches_for_new_page_size() - if self._batches is None: - self._error_message = ( - "Could not retrieve data batches. Data might be unavailable or " - "an error occurred." - ) - self.row_count = None - elif self._batches.total_rows is None: - # Total rows is unknown, this is an expected state. - # TODO(b/461536343): Cheaply discover if we have exactly 1 page. - # There are cases where total rows is not set, but there are no additional - # pages. We could disable the "next" button in these cases. 
- self.row_count = None - else: - self.row_count = self._batches.total_rows + if self._batches is None: + self._error_message = ( + "Could not retrieve data batches. Data might be unavailable or " + "an error occurred." + ) + self.row_count = None + elif self._batches.total_rows is None: + # Total rows is unknown, this is an expected state. + # TODO(b/461536343): Cheaply discover if we have exactly 1 page. + # There are cases where total rows is not set, but there are no additional + # pages. We could disable the "next" button in these cases. + self.row_count = None + else: + self.row_count = self._batches.total_rows - # get the initial page - self._set_table_html() + # get the initial page + self._set_table_html() @traitlets.observe("_initial_load_complete") def _on_initial_load_complete(self, change: dict[str, Any]): @@ -281,7 +282,9 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" new_page = None - with self._setting_html_lock: + with self._setting_html_lock, bigframes.option_context( + "display.progress_bar", None + ): if self._error_message: self.table_html = ( f"
" diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 6102d1512c..2bf847a259 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -363,7 +363,8 @@ def repr_mimebundle( if opts.repr_mode == "anywidget": try: - return get_anywidget_bundle(obj, include=include, exclude=exclude) + with bigframes.option_context("display.progress_bar", None): + return get_anywidget_bundle(obj, include=include, exclude=exclude) except ImportError: # Anywidget is an optional dependency, so warn rather than fail. # TODO(shuowei): When Anywidget becomes the default for all repr modes, diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 7260a79223..3d3ccfef11 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -220,6 +220,7 @@ def output_type(self, *input_types): class JSONDecode(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_decode" to_type: dtypes.Dtype + safe: bool = False def output_type(self, *input_types): input_type = input_types[0] diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 575beff8fc..ead0b0591b 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -34,6 +34,7 @@ numeric_ops, string_ops, ) +import bigframes.operations.json_ops as json_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -94,6 +95,7 @@ string_ops.EndsWithOp, string_ops.StrContainsOp, string_ops.StrContainsRegexOp, + json_ops.JSONDecode, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index e9491610ac..3bc12617fc 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -91,7 +91,9 @@ "outputs": [ 
{ "data": { - "text/html": [], + "text/html": [ + "Starting." + ], "text/plain": [ "" ] @@ -117,17 +119,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -145,30 +147,10 @@ "id": "220340b0", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", + "model_id": "c74c3719ba43489890185b5c9880acfc", "version_major": 2, "version_minor": 1 }, @@ -204,80 +186,80 @@ " AL\n", " F\n", " 1910\n", - " Lillian\n", - " 99\n", + " Hazel\n", + " 51\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Ruby\n", - " 204\n", + " Lucy\n", + " 76\n", " \n", " \n", " 2\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Helen\n", - " 76\n", + " Nellie\n", + " 39\n", " \n", " \n", " 3\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Eunice\n", - " 41\n", + " Lena\n", + " 40\n", " \n", " \n", " 4\n", - " AR\n", + " CO\n", " F\n", " 1910\n", - " Dora\n", - " 42\n", + " Thelma\n", + " 36\n", " \n", " \n", " 5\n", - " CA\n", + " CO\n", " F\n", " 1910\n", - " Edna\n", - " 62\n", + " Ruth\n", + " 
68\n", " \n", " \n", " 6\n", - " CA\n", + " CT\n", " F\n", " 1910\n", - " Helen\n", - " 239\n", + " Elizabeth\n", + " 86\n", " \n", " \n", " 7\n", - " CO\n", + " DC\n", " F\n", " 1910\n", - " Alice\n", - " 46\n", + " Mary\n", + " 80\n", " \n", " \n", " 8\n", " FL\n", " F\n", " 1910\n", - " Willie\n", - " 71\n", + " Annie\n", + " 101\n", " \n", " \n", " 9\n", " FL\n", " F\n", " 1910\n", - " Thelma\n", - " 65\n", + " Alma\n", + " 39\n", " \n", " \n", "\n", @@ -285,17 +267,17 @@ "
[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Hazel 51\n", + " AL F 1910 Lucy 76\n", + " AR F 1910 Nellie 39\n", + " AR F 1910 Lena 40\n", + " CO F 1910 Thelma 36\n", + " CO F 1910 Ruth 68\n", + " CT F 1910 Elizabeth 86\n", + " DC F 1910 Mary 80\n", + " FL F 1910 Annie 101\n", + " FL F 1910 Alma 39\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -329,7 +311,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", + " Query processed 171.4 MB in 35 seconds of slot time. [Job bigframes-dev:US.e15f1b34-e414-42d2-857b-926ea25947c4 details]\n", " " ], "text/plain": [ @@ -355,7 +337,9 @@ }, { "data": { - "text/html": [], + "text/html": [ + "Starting." + ], "text/plain": [ "" ] @@ -404,38 +388,10 @@ "id": "da23e0f3", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. 
[Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", + "model_id": "2ad9004bda464950ab6eda63b1b86a3a", "version_major": 2, "version_minor": 1 }, @@ -533,34 +489,6 @@ "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 10 seconds of slot time. [Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -571,12 +499,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "52d11291ba1d42e6b544acbd86eef6cf", + "model_id": "c1b84125429c4fbc90a22e1adbeea901", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -648,34 +576,6 @@ "id": "a9d5d13a", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. 
\n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -686,12 +586,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "32c61c84740d45a0ac37202a76c7c14e", + "model_id": "23fc730e004b4eec807d2829bffa3ce0", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -752,33 +652,7 @@ "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", @@ -792,7 +666,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d60a47296214553bb10c434b5ee8330", + "model_id": "a3bf6021c6fd44299d317d3b44213b50", "version_major": 2, "version_minor": 1 }, @@ -839,24 +713,6 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 29.08.018\n", - " E04H 6/12\n", - " <NA>\n", - " 18157874.1\n", - " 
21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanwälte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER\n", - " EP 3 366 869 A1\n", - " \n", - " \n", - " 1\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", " 03.10.2018\n", " H05B 6/12\n", " <NA>\n", @@ -870,7 +726,7 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 2\n", + " 1\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -888,7 +744,7 @@ " EP 3 382 744 A1\n", " \n", " \n", - " 3\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -906,7 +762,7 @@ " EP 3 382 553 A1\n", " \n", " \n", - " 4\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -923,6 +779,24 @@ " MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", + " \n", + " 4\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", + " 29.08.018\n", + " E04H 6/12\n", + " <NA>\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanwälte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER\n", + " EP 3 366 869 A1\n", + " \n", " \n", "\n", "

5 rows × 15 columns

\n", @@ -944,32 +818,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 29.08.018 E04H 6/12 18157874.1 \n", - "1 03.10.2018 H05B 6/12 18165514.3 \n", - "2 03.10.2018 H01L 21/20 18166536.5 \n", - "3 03.10.2018 G06F 11/30 18157347.8 \n", - "4 03.10.2018 A01K 31/00 18171005.4 \n", + "0 03.10.2018 H05B 6/12 18165514.3 \n", + "1 03.10.2018 H01L 21/20 18166536.5 \n", + "2 03.10.2018 G06F 11/30 18157347.8 \n", + "3 03.10.2018 A01K 31/00 18171005.4 \n", + "4 29.08.018 E04H 6/12 18157874.1 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", - "1 03.04.2018 30.03.2017 \n", - "2 16.02.2016 Scheider, Sascha et al \n", - "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "4 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "0 03.04.2018 30.03.2017 \n", + "1 16.02.2016 Scheider, Sascha et al \n", + "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 BSH Hausgeräte GmbH Acero Acero, Jesus \n", - "2 EV Group E. Thallner GmbH Kurz, Florian \n", - "3 FUJITSU LIMITED Kukihara, Kensuke \n", - "4 Linco Food Systems A/S Thrane, Uffe \n", + "0 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "1 EV Group E. Thallner GmbH Kurz, Florian \n", + "2 FUJITSU LIMITED Kukihara, Kensuke \n", + "3 Linco Food Systems A/S Thrane, Uffe \n", + "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", "\n", " title_line_1 number \n", - "0 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", - "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", - "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... 
EP 3 382 553 A1 \n", - "4 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "4 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", "\n", "[5 rows x 15 columns]" ] diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 516a46d4dd..6f04631264 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -4142,7 +4142,6 @@ def test_json_astype_others_raise_error(data, to_type): bf_series.astype(to_type, errors="raise").to_pandas() -@pytest.mark.skip(reason="AssertionError: Series NA mask are different") @pytest.mark.parametrize( ("data", "to_type"), [