Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 1 addition & 51 deletions bigframes/blob/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,57 +126,7 @@ def udf(self):
return self._session.read_gbq_function(udf_name)


def exif_func(src_obj_ref_rt: str, verbose: bool) -> str:
    """Extract EXIF metadata from an image blob and return it as JSON text.

    Args:
        src_obj_ref_rt: ObjectRefRuntime JSON string carrying a signed
            read URL under ["access_urls"]["read_url"].
        verbose: When True, wrap the result in a {"status", "content"}
            envelope; when False, return the bare EXIF JSON ("{}" on error).

    Returns:
        str: JSON text of EXIF tag/value pairs, optionally wrapped in the
        status envelope.
    """
    try:
        import io
        import json

        from PIL import ExifTags, Image
        import requests
        from requests import adapters

        http = requests.Session()
        # Retry transient HTTPS failures a few times before giving up.
        http.mount("https://", adapters.HTTPAdapter(max_retries=3))

        read_url = json.loads(src_obj_ref_rt)["access_urls"]["read_url"]

        resp = http.get(read_url, timeout=30)
        resp.raise_for_status()

        img = Image.open(io.BytesIO(resp.content))
        raw_exif = img.getexif()

        tags = {}
        for tag_id, tag_value in raw_exif.items():
            key = ExifTags.TAGS.get(tag_id, tag_id)
            # Fall back to str() for values JSON cannot serialize.
            try:
                json.dumps(tag_value)
            except (TypeError, ValueError):
                tags[key] = str(tag_value)
            else:
                tags[key] = tag_value

        if not verbose:
            return json.dumps(tags)
        return json.dumps({"status": "", "content": json.dumps(tags)})

    except Exception as e:
        # Surface the failure as data rather than raising, so one bad row
        # does not fail the whole batch.
        if not verbose:
            return "{}"
        return json.dumps(
            {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"}
        )


# Package exif_func with the third-party packages it imports at runtime so it
# can be deployed as a transform function.
exif_func_def = FunctionDef(exif_func, ["pillow", "requests"])


# Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string.
def image_blur_func(
src_obj_ref_rt: str,
dst_obj_ref_rt: str,
Expand Down
70 changes: 0 additions & 70 deletions bigframes/operations/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,76 +336,6 @@ def get_runtime_json_str(
runtime = self._get_runtime(mode=mode, with_metadata=with_metadata)
return runtime._apply_unary_op(ops.ToJSONString())

def exif(
    self,
    *,
    engine: Literal[None, "pillow"] = None,
    connection: Optional[str] = None,
    max_batching_rows: int = 8192,
    container_cpu: Union[float, int] = 0.33,
    container_memory: str = "512Mi",
    verbose: bool = False,
) -> bigframes.series.Series:
    """Extract EXIF data. Currently only image types are supported.

    Args:
        engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
        connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
        max_batching_rows (int, default 8,192): Max number of rows per batch sent to cloud run to execute the function.
        container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
        container_memory (str, default "512Mi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
        verbose (bool, default False): If True, returns a struct with status and content fields. If False, returns only the content.

    Returns:
        bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True.

    Raises:
        ValueError: If engine is not 'pillow'.
        RuntimeError: If EXIF extraction fails or returns invalid structure.
    """
    # 'pillow' is the only supported engine; require it explicitly so callers
    # opt in to the third-party dependency rather than getting it by default.
    if engine is None or engine.casefold() != "pillow":
        raise ValueError("Must specify the engine, supported value is 'pillow'.")

    import bigframes.bigquery as bbq
    import bigframes.blob._functions as blob_func
    import bigframes.pandas as bpd

    connection = self._resolve_connection(connection)
    # Mode "R" runtime JSON carries signed read URLs the UDF uses to
    # download each image.
    df = self.get_runtime_json_str(mode="R").to_frame()
    df["verbose"] = verbose

    # Deploy (or reuse) the remote UDF that performs the EXIF extraction.
    exif_udf = blob_func.TransformFunction(
        blob_func.exif_func_def,
        session=self._data._block.session,
        connection=connection,
        max_batching_rows=max_batching_rows,
        container_cpu=container_cpu,
        container_memory=container_memory,
    ).udf()

    res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction")

    if verbose:
        # Verbose results arrive as {"status": ..., "content": ...} JSON
        # strings; split the two fields and repackage them as a struct.
        try:
            exif_content_series = bbq.parse_json(
                res._apply_unary_op(ops.JSONValue(json_path="$.content"))
            ).rename("exif_content")
            exif_status_series = res._apply_unary_op(
                ops.JSONValue(json_path="$.status")
            )
        except Exception as e:
            raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e
        results_df = bpd.DataFrame(
            {"status": exif_status_series, "content": exif_content_series}
        )
        results_struct = bbq.struct(results_df).rename("exif_results")
        return results_struct
    else:
        try:
            return bbq.parse_json(res)
        except Exception as e:
            raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e

def image_blur(
self,
ksize: tuple[int, int],
Expand Down
94 changes: 92 additions & 2 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"3. Conduct image transformations\n",
"4. Use LLM models to ask questions and generate embeddings on images\n",
"5. PDF chunking function\n",
"6. Transcribe audio"
"6. Transcribe audio\n",
"7. Extract EXIF metadata from images"
]
},
{
Expand Down Expand Up @@ -104,6 +105,11 @@
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
"\n",
"LOCATION = \"us\" # replace with your location.\n",
"\n",
"# Dataset where the UDF will be created.\n",
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
"\n",
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
Expand All @@ -112,12 +118,14 @@
"import bigframes\n",
"# Setup project\n",
"bigframes.options.bigquery.project = PROJECT\n",
"bigframes.options.bigquery.location = LOCATION\n",
"\n",
"# Display options\n",
"bigframes.options.display.blob_display_width = 300\n",
"bigframes.options.display.progress_bar = None\n",
"\n",
"import bigframes.pandas as bpd"
"import bigframes.pandas as bpd\n",
"import bigframes.bigquery as bbq"
]
},
{
Expand Down Expand Up @@ -1546,6 +1554,88 @@
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
"transcribed_series_verbose"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7. Extract EXIF metadata from images"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Construct the canonical connection ID\n",
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
"\n",
"@bpd.udf(\n",
" input_types=[str],\n",
" output_type=str,\n",
" dataset=DATASET_ID,\n",
" name=\"extract_exif\",\n",
" bigquery_connection=FULL_CONNECTION_ID,\n",
" packages=[\"pillow\", \"requests\"],\n",
" max_batching_rows=8192,\n",
" container_cpu=0.33,\n",
" container_memory=\"512Mi\"\n",
")\n",
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
" import io\n",
" import json\n",
" from PIL import ExifTags, Image\n",
" import requests\n",
" from requests import adapters\n",
" session = requests.Session()\n",
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
" response = session.get(src_url, timeout=30)\n",
" bts = response.content\n",
" image = Image.open(io.BytesIO(bts))\n",
" exif_data = image.getexif()\n",
" exif_dict = {}\n",
" if exif_data:\n",
" for tag, value in exif_data.items():\n",
" tag_name = ExifTags.TAGS.get(tag, tag)\n",
" exif_dict[tag_name] = value\n",
" return json.dumps(exif_dict)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a Multimodal DataFrame from the sample image URIs\n",
"exif_image_df = bpd.from_glob_path(\n",
" \"gs://bigframes_blob_test/images_exif/*\",\n",
" name=\"blob_col\",\n",
")\n",
"\n",
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
"# This allows the UDF to download the images from Google Cloud Storage\n",
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
"\n",
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
"# We cast to string to ensure the input matches the UDF's signature\n",
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
"\n",
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
"exif_data = bbq.parse_json(exif_json)\n",
"\n",
"exif_data"
]
}
],
"metadata": {
Expand Down
51 changes: 0 additions & 51 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,57 +52,6 @@ def images_output_uris(images_output_folder: str) -> list[str]:
]


def test_blob_exif(
    bq_connection: str,
    session: bigframes.Session,
):
    # Build a multimodal frame over the fixture images and extract EXIF.
    images_df = session.from_glob_path(
        "gs://bigframes_blob_test/images_exif/*",
        name="blob_col",
        connection=bq_connection,
    )
    result = images_df["blob_col"].blob.exif(
        engine="pillow", connection=bq_connection, verbose=False
    )

    # The single fixture image carries known ExifOffset/Make tags.
    expected = bpd.Series(
        ['{"ExifOffset": 47, "Make": "MyCamera"}'],
        dtype=dtypes.JSON_DTYPE,
        session=session,
    )
    pd.testing.assert_series_equal(
        result.to_pandas(),
        expected.to_pandas(),
        check_index_type=False,
        check_dtype=False,
    )


def test_blob_exif_verbose(
    bq_connection: str,
    session: bigframes.Session,
):
    images_df = session.from_glob_path(
        "gs://bigframes_blob_test/images_exif/*",
        name="blob_col",
        connection=bq_connection,
    )
    result = images_df["blob_col"].blob.exif(
        engine="pillow", connection=bq_connection, verbose=True
    )

    # verbose=True yields a struct series exposing status/content fields.
    assert hasattr(result, "struct")
    exploded = result.struct.explode()
    for field in ("status", "content"):
        assert field in exploded.columns

    assert exploded["status"].dtype == dtypes.STRING_DTYPE
    assert exploded["content"].dtype == dtypes.JSON_DTYPE


def test_blob_image_blur_to_series(
images_mm_df: bpd.DataFrame,
bq_connection: str,
Expand Down
Loading