Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 1 addition & 51 deletions bigframes/blob/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,57 +126,7 @@ def udf(self):
return self._session.read_gbq_function(udf_name)


def exif_func(src_obj_ref_rt: str, verbose: bool) -> str:
    """Extract EXIF metadata from an image blob and return it as JSON text.

    Args:
        src_obj_ref_rt: ObjectRefRuntime JSON string carrying a signed
            read URL under ["access_urls"]["read_url"].
        verbose: When True, wrap the result in a {"status", "content"}
            envelope; when False, return the bare EXIF JSON ("{}" on error).

    Returns:
        str: JSON text of EXIF tag/value pairs, optionally wrapped in the
        status envelope.
    """
    try:
        import io
        import json

        from PIL import ExifTags, Image
        import requests
        from requests import adapters

        http = requests.Session()
        # Retry transient HTTPS failures a few times before giving up.
        http.mount("https://", adapters.HTTPAdapter(max_retries=3))

        read_url = json.loads(src_obj_ref_rt)["access_urls"]["read_url"]

        resp = http.get(read_url, timeout=30)
        resp.raise_for_status()

        img = Image.open(io.BytesIO(resp.content))
        raw_exif = img.getexif()

        tags = {}
        for tag_id, tag_value in raw_exif.items():
            key = ExifTags.TAGS.get(tag_id, tag_id)
            # Fall back to str() for values JSON cannot serialize.
            try:
                json.dumps(tag_value)
            except (TypeError, ValueError):
                tags[key] = str(tag_value)
            else:
                tags[key] = tag_value

        if not verbose:
            return json.dumps(tags)
        return json.dumps({"status": "", "content": json.dumps(tags)})

    except Exception as e:
        # Surface the failure as data rather than raising, so one bad row
        # does not fail the whole batch.
        if not verbose:
            return "{}"
        return json.dumps(
            {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"}
        )


# Package exif_func with the third-party packages it imports at runtime so it
# can be deployed as a transform function.
exif_func_def = FunctionDef(exif_func, ["pillow", "requests"])


# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string.
def image_blur_func(
src_obj_ref_rt: str,
dst_obj_ref_rt: str,
Expand Down
70 changes: 0 additions & 70 deletions bigframes/operations/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,76 +336,6 @@ def get_runtime_json_str(
runtime = self._get_runtime(mode=mode, with_metadata=with_metadata)
return runtime._apply_unary_op(ops.ToJSONString())

def exif(
    self,
    *,
    engine: Literal[None, "pillow"] = None,
    connection: Optional[str] = None,
    max_batching_rows: int = 8192,
    container_cpu: Union[float, int] = 0.33,
    container_memory: str = "512Mi",
    verbose: bool = False,
) -> bigframes.series.Series:
    """Extract EXIF data. Currently only image types are supported.

    Args:
        engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
        connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
        max_batching_rows (int, default 8,192): Max number of rows per batch sent to cloud run to execute the function.
        container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
        container_memory (str, default "512Mi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
        verbose (bool, default False): If True, returns a struct with status and content fields. If False, returns only the content.

    Returns:
        bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True.

    Raises:
        ValueError: If engine is not 'pillow'.
        RuntimeError: If EXIF extraction fails or returns invalid structure.
    """
    # 'pillow' is the only supported engine; require it explicitly so callers
    # opt in to the third-party dependency rather than getting it by default.
    if engine is None or engine.casefold() != "pillow":
        raise ValueError("Must specify the engine, supported value is 'pillow'.")

    import bigframes.bigquery as bbq
    import bigframes.blob._functions as blob_func
    import bigframes.pandas as bpd

    connection = self._resolve_connection(connection)
    # Mode "R" runtime JSON carries signed read URLs the UDF uses to
    # download each image.
    df = self.get_runtime_json_str(mode="R").to_frame()
    df["verbose"] = verbose

    # Deploy (or reuse) the remote UDF that performs the EXIF extraction.
    exif_udf = blob_func.TransformFunction(
        blob_func.exif_func_def,
        session=self._data._block.session,
        connection=connection,
        max_batching_rows=max_batching_rows,
        container_cpu=container_cpu,
        container_memory=container_memory,
    ).udf()

    res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction")

    if verbose:
        # Verbose results arrive as {"status": ..., "content": ...} JSON
        # strings; split the two fields and repackage them as a struct.
        try:
            exif_content_series = bbq.parse_json(
                res._apply_unary_op(ops.JSONValue(json_path="$.content"))
            ).rename("exif_content")
            exif_status_series = res._apply_unary_op(
                ops.JSONValue(json_path="$.status")
            )
        except Exception as e:
            raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
        results_df = bpd.DataFrame(
            {"status": exif_status_series, "content": exif_content_series}
        )
        results_struct = bbq.struct(results_df).rename("exif_results")
        return results_struct
    else:
        try:
            return bbq.parse_json(res)
        except Exception as e:
            raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e

def image_blur(
self,
ksize: tuple[int, int],
Expand Down
94 changes: 92 additions & 2 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"3. Conduct image transformations\n",
"4. Use LLM models to ask questions and generate embeddings on images\n",
"5. PDF chunking function\n",
"6. Transcribe audio"
"6. Transcribe audio\n",
"7. Extract EXIF metadata from images"
]
},
{
Expand Down Expand Up @@ -104,6 +105,11 @@
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
"\n",
"LOCATION = \"us\" # replace with your location.\n",
"\n",
"# Dataset where the UDF will be created.\n",
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
"\n",
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
Expand All @@ -112,12 +118,14 @@
"import bigframes\n",
"# Setup project\n",
"bigframes.options.bigquery.project = PROJECT\n",
"bigframes.options.bigquery.location = LOCATION\n",
"\n",
"# Display options\n",
"bigframes.options.display.blob_display_width = 300\n",
"bigframes.options.display.progress_bar = None\n",
"\n",
"import bigframes.pandas as bpd"
"import bigframes.pandas as bpd\n",
"import bigframes.bigquery as bbq"
]
},
{
Expand Down Expand Up @@ -1546,6 +1554,88 @@
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
"transcribed_series_verbose"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7. Extract EXIF metadata from images"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Construct the canonical connection ID\n",
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
"\n",
"@bpd.udf(\n",
" input_types=[str],\n",
" output_type=str,\n",
" dataset=DATASET_ID,\n",
" name=\"extract_exif\",\n",
" bigquery_connection=FULL_CONNECTION_ID,\n",
" packages=[\"pillow\", \"requests\"],\n",
" max_batching_rows=8192,\n",
" container_cpu=0.33,\n",
" container_memory=\"512Mi\"\n",
")\n",
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
" import io\n",
" import json\n",
" from PIL import ExifTags, Image\n",
" import requests\n",
" from requests import adapters\n",
" session = requests.Session()\n",
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
" response = session.get(src_url, timeout=30)\n",
" bts = response.content\n",
" image = Image.open(io.BytesIO(bts))\n",
" exif_data = image.getexif()\n",
" exif_dict = {}\n",
" if exif_data:\n",
" for tag, value in exif_data.items():\n",
" tag_name = ExifTags.TAGS.get(tag, tag)\n",
" exif_dict[tag_name] = value\n",
" return json.dumps(exif_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a Multimodal DataFrame from the sample image URIs\n",
"exif_image_df = bpd.from_glob_path(\n",
" \"gs://bigframes_blob_test/images_exif/*\",\n",
" name=\"blob_col\",\n",
")\n",
"\n",
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
"# This allows the UDF to download the images from Google Cloud Storage\n",
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
"\n",
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
"# We cast to string to ensure the input matches the UDF's signature\n",
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
"\n",
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
"exif_data = bbq.parse_json(exif_json)\n",
"\n",
"exif_data"
]
}
],
"metadata": {
Expand Down
51 changes: 0 additions & 51 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,57 +52,6 @@ def images_output_uris(images_output_folder: str) -> list[str]:
]


def test_blob_exif(
    bq_connection: str,
    session: bigframes.Session,
):
    # Build a multimodal frame over the fixture images and extract EXIF.
    images_df = session.from_glob_path(
        "gs://bigframes_blob_test/images_exif/*",
        name="blob_col",
        connection=bq_connection,
    )
    result = images_df["blob_col"].blob.exif(
        engine="pillow", connection=bq_connection, verbose=False
    )

    # The single fixture image carries known ExifOffset/Make tags.
    expected = bpd.Series(
        ['{"ExifOffset": 47, "Make": "MyCamera"}'],
        dtype=dtypes.JSON_DTYPE,
        session=session,
    )
    pd.testing.assert_series_equal(
        result.to_pandas(),
        expected.to_pandas(),
        check_index_type=False,
        check_dtype=False,
    )


def test_blob_exif_verbose(
    bq_connection: str,
    session: bigframes.Session,
):
    images_df = session.from_glob_path(
        "gs://bigframes_blob_test/images_exif/*",
        name="blob_col",
        connection=bq_connection,
    )
    result = images_df["blob_col"].blob.exif(
        engine="pillow", connection=bq_connection, verbose=True
    )

    # verbose=True yields a struct series exposing status/content fields.
    assert hasattr(result, "struct")
    exploded = result.struct.explode()
    for field in ("status", "content"):
        assert field in exploded.columns

    assert exploded["status"].dtype == dtypes.STRING_DTYPE
    assert exploded["content"].dtype == dtypes.JSON_DTYPE


def test_blob_image_blur_to_series(
images_mm_df: bpd.DataFrame,
bq_connection: str,
Expand Down
Loading