diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
index 3dfe38811b1..f8a79ca9b0b 100644
--- a/bigframes/blob/_functions.py
+++ b/bigframes/blob/_functions.py
@@ -126,57 +126,7 @@ def udf(self):
         return self._session.read_gbq_function(udf_name)


-def exif_func(src_obj_ref_rt: str, verbose: bool) -> str:
-    try:
-        import io
-        import json
-
-        from PIL import ExifTags, Image
-        import requests
-        from requests import adapters
-
-        session = requests.Session()
-        session.mount("https://", adapters.HTTPAdapter(max_retries=3))
-
-        src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
-        src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
-
-        response = session.get(src_url, timeout=30)
-        response.raise_for_status()
-        bts = response.content
-
-        image = Image.open(io.BytesIO(bts))
-        exif_data = image.getexif()
-        exif_dict = {}
-
-        if exif_data:
-            for tag, value in exif_data.items():
-                tag_name = ExifTags.TAGS.get(tag, tag)
-                # Convert non-serializable types to strings
-                try:
-                    json.dumps(value)
-                    exif_dict[tag_name] = value
-                except (TypeError, ValueError):
-                    exif_dict[tag_name] = str(value)
-
-        if verbose:
-            return json.dumps({"status": "", "content": json.dumps(exif_dict)})
-        else:
-            return json.dumps(exif_dict)
-
-    except Exception as e:
-        # Return error as JSON with error field
-        error_result = {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"}
-        if verbose:
-            return json.dumps(error_result)
-        else:
-            return "{}"
-
-
-exif_func_def = FunctionDef(exif_func, ["pillow", "requests"])
-
-
-# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string.
+# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string.
 def image_blur_func(
     src_obj_ref_rt: str,
     dst_obj_ref_rt: str,
diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
index 9210addaa81..edb4ebf9040 100644
--- a/bigframes/operations/blob.py
+++ b/bigframes/operations/blob.py
@@ -336,76 +336,6 @@ def get_runtime_json_str(
         runtime = self._get_runtime(mode=mode, with_metadata=with_metadata)
         return runtime._apply_unary_op(ops.ToJSONString())

-    def exif(
-        self,
-        *,
-        engine: Literal[None, "pillow"] = None,
-        connection: Optional[str] = None,
-        max_batching_rows: int = 8192,
-        container_cpu: Union[float, int] = 0.33,
-        container_memory: str = "512Mi",
-        verbose: bool = False,
-    ) -> bigframes.series.Series:
-        """Extract EXIF data. Now only support image types.
-
-        Args:
-            engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
-            connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
-            max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function.
-            container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers.
-            container_memory (str, default "512Mi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
-            verbose (bool, default False): If True, returns a struct with status and content fields. If False, returns only the content.
-
-        Returns:
-            bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True.
-
-        Raises:
-            ValueError: If engine is not 'pillow'.
-            RuntimeError: If EXIF extraction fails or returns invalid structure.
-        """
-        if engine is None or engine.casefold() != "pillow":
-            raise ValueError("Must specify the engine, supported value is 'pillow'.")
-
-        import bigframes.bigquery as bbq
-        import bigframes.blob._functions as blob_func
-        import bigframes.pandas as bpd
-
-        connection = self._resolve_connection(connection)
-        df = self.get_runtime_json_str(mode="R").to_frame()
-        df["verbose"] = verbose
-
-        exif_udf = blob_func.TransformFunction(
-            blob_func.exif_func_def,
-            session=self._data._block.session,
-            connection=connection,
-            max_batching_rows=max_batching_rows,
-            container_cpu=container_cpu,
-            container_memory=container_memory,
-        ).udf()
-
-        res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction")
-
-        if verbose:
-            try:
-                exif_content_series = bbq.parse_json(
-                    res._apply_unary_op(ops.JSONValue(json_path="$.content"))
-                ).rename("exif_content")
-                exif_status_series = res._apply_unary_op(
-                    ops.JSONValue(json_path="$.status")
-                )
-            except Exception as e:
-                raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
-            results_df = bpd.DataFrame(
-                {"status": exif_status_series, "content": exif_content_series}
-            )
-            results_struct = bbq.struct(results_df).rename("exif_results")
-            return results_struct
-        else:
-            try:
-                return bbq.parse_json(res)
-            except Exception as e:
-                raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
-
     def image_blur(
         self,
         ksize: tuple[int, int],
diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb
index 0822ee4c2db..bb5733bde26 100644
--- a/notebooks/multimodal/multimodal_dataframe.ipynb
+++ b/notebooks/multimodal/multimodal_dataframe.ipynb
@@ -61,7 +61,8 @@
     "3. Conduct image transformations\n",
     "4. Use LLM models to ask questions and generate embeddings on images\n",
     "5. PDF chunking function\n",
-    "6. Transcribe audio"
+    "6. Transcribe audio\n",
+    "7. Extract EXIF metadata from images"
    ]
   },
   {
@@ -104,6 +105,11 @@
     "PROJECT = \"bigframes-dev\" # replace with your project. \n",
     "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
     "\n",
+    "LOCATION = \"us\" # replace with your location.\n",
+    "\n",
+    "# Dataset where the UDF will be created.\n",
+    "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
+    "\n",
     "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
     "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
     "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
@@ -112,12 +118,14 @@
     "import bigframes\n",
     "# Setup project\n",
     "bigframes.options.bigquery.project = PROJECT\n",
+    "bigframes.options.bigquery.location = LOCATION\n",
     "\n",
     "# Display options\n",
     "bigframes.options.display.blob_display_width = 300\n",
     "bigframes.options.display.progress_bar = None\n",
     "\n",
-    "import bigframes.pandas as bpd"
+    "import bigframes.pandas as bpd\n",
+    "import bigframes.bigquery as bbq"
    ]
   },
   {
@@ -1546,6 +1554,88 @@
     "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
     "transcribed_series_verbose"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7. Extract EXIF metadata from images"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct the canonical connection ID\n",
+    "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+    "\n",
+    "@bpd.udf(\n",
+    "    input_types=[str],\n",
+    "    output_type=str,\n",
+    "    dataset=DATASET_ID,\n",
+    "    name=\"extract_exif\",\n",
+    "    bigquery_connection=FULL_CONNECTION_ID,\n",
+    "    packages=[\"pillow\", \"requests\"],\n",
+    "    max_batching_rows=8192,\n",
+    "    container_cpu=0.33,\n",
+    "    container_memory=\"512Mi\"\n",
+    ")\n",
+    "def extract_exif(src_obj_ref_rt: str) -> str:\n",
+    "    import io\n",
+    "    import json\n",
+    "    from PIL import ExifTags, Image\n",
+    "    import requests\n",
+    "    from requests import adapters\n",
+    "    session = requests.Session()\n",
+    "    session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+    "    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+    "    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+    "    response = session.get(src_url, timeout=30)\n",
+    "    bts = response.content\n",
+    "    image = Image.open(io.BytesIO(bts))\n",
+    "    exif_data = image.getexif()\n",
+    "    exif_dict = {}\n",
+    "    if exif_data:\n",
+    "        for tag, value in exif_data.items():\n",
+    "            tag_name = ExifTags.TAGS.get(tag, tag)\n",
+    "            exif_dict[tag_name] = value\n",
+    "    return json.dumps(exif_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a Multimodal DataFrame from the sample image URIs\n",
+    "exif_image_df = bpd.from_glob_path(\n",
+    "    \"gs://bigframes_blob_test/images_exif/*\",\n",
+    "    name=\"blob_col\",\n",
+    ")\n",
+    "\n",
+    "# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+    "# This allows the UDF to download the images from Google Cloud Storage\n",
+    "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
+    "\n",
+    "# Apply the BigQuery Python UDF to the runtime JSON strings\n",
+    "# We cast to string to ensure the input matches the UDF's signature\n",
+    "exif_json = access_urls.astype(str).apply(extract_exif)\n",
+    "\n",
+    "# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
+    "exif_data = bbq.parse_json(exif_json)\n",
+    "\n",
+    "exif_data"
+   ]
+  }
 ],
 "metadata": {
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
index 7963fabd0b6..94b3ce2ca6b 100644
--- a/tests/system/large/blob/test_function.py
+++ b/tests/system/large/blob/test_function.py
@@ -52,57 +52,6 @@ def images_output_uris(images_output_folder: str) -> list[str]:
     ]


-def test_blob_exif(
-    bq_connection: str,
-    session: bigframes.Session,
-):
-    exif_image_df = session.from_glob_path(
-        "gs://bigframes_blob_test/images_exif/*",
-        name="blob_col",
-        connection=bq_connection,
-    )
-
-    actual = exif_image_df["blob_col"].blob.exif(
-        engine="pillow", connection=bq_connection, verbose=False
-    )
-    expected = bpd.Series(
-        ['{"ExifOffset": 47, "Make": "MyCamera"}'],
-        session=session,
-        dtype=dtypes.JSON_DTYPE,
-    )
-    pd.testing.assert_series_equal(
-        actual.to_pandas(),
-        expected.to_pandas(),
-        check_dtype=False,
-        check_index_type=False,
-    )
-
-
-def test_blob_exif_verbose(
-    bq_connection: str,
-    session: bigframes.Session,
-):
-    exif_image_df = session.from_glob_path(
-        "gs://bigframes_blob_test/images_exif/*",
-        name="blob_col",
-        connection=bq_connection,
-    )
-
-    actual = exif_image_df["blob_col"].blob.exif(
-        engine="pillow", connection=bq_connection, verbose=True
-    )
-    assert hasattr(actual, "struct")
-    actual_exploded = actual.struct.explode()
-    assert "status" in actual_exploded.columns
-    assert "content" in actual_exploded.columns
-
-    status_series = actual_exploded["status"]
-    assert status_series.dtype == dtypes.STRING_DTYPE
-
-    content_series = actual_exploded["content"]
-    assert content_series.dtype == dtypes.JSON_DTYPE
-
-
 def test_blob_image_blur_to_series(
     images_mm_df: bpd.DataFrame,
     bq_connection: str,