Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions bigquery_magics/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,8 +633,9 @@ def _handle_result(result, args):


def _colab_query_callback(query: str, params: str):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The query and params arguments are no longer used in this function. It's a good practice to prefix unused variables with an underscore to make the code clearer.

Suggested change
def _colab_query_callback(query: str, params: str):
def _colab_query_callback(_query: str, _params: str):

query_results = json.loads(graph_server.graph_server.query_result.to_json())
return IPython.core.display.JSON(
graph_server.convert_graph_data(query_results=json.loads(params))
graph_server.convert_graph_data(query_results=query_results)
)


Expand Down Expand Up @@ -697,11 +698,13 @@ def _add_graph_widget(query_result):
singleton_server_thread = graph_server.graph_server.init()
port = graph_server.graph_server.port

graph_server.graph_server.query_result = query_result

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using a single global variable graph_server.graph_server.query_result to pass data to the graph visualizer introduces a potential race condition. If two notebook cells that generate graphs are run in quick succession, the second cell could overwrite query_result before the first cell's visualization has had a chance to fetch its data, leading to the first visualization displaying the second one's data.

A more robust solution would be to avoid this shared state. For example, you could store query results in a dictionary keyed by a unique ID:

  1. Generate a unique ID for each query.
  2. Store the query result in a dictionary on the graph_server object, e.g., graph_server.graph_server.results[query_id] = query_result.
  3. Pass this query_id to the frontend via the params argument of generate_visualization_html.
  4. The frontend callback would then include this query_id when calling back to Python, allowing _colab_query_callback and handle_post_query to retrieve the correct result.

This would make the visualization mechanism stateless and safe for concurrent executions.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a bad suggestion, but I have not encountered a notebook environment that allows more than one cell to run at a time, so it's not a huge concern for me.


# Create html to invoke the graph server
html_content = generate_visualization_html(
query="placeholder query",
port=port,
params=query_result.to_json().replace("\\", "\\\\").replace('"', '\\"'),
params="{}",
)
IPython.display.display(IPython.core.display.HTML(html_content))

Expand Down
6 changes: 4 additions & 2 deletions bigquery_magics/graph_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def __init__(self):
self.port = None
self.url = None
self._server = None
self.query_result = None

def build_route(self, endpoint):
"""
Expand Down Expand Up @@ -250,8 +251,9 @@ def handle_post_ping(self):
self.do_data_response({"your_request": data})

def handle_post_query(self):
data = self.parse_post_data()
response = convert_graph_data(query_results=json.loads(data["params"]))
self.parse_post_data()
query_results = json.loads(graph_server.query_result.to_json())
response = convert_graph_data(query_results=query_results)
self.do_data_response(response)

def handle_post_node_expansion(self):
Expand Down
37 changes: 5 additions & 32 deletions tests/unit/bigquery/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,18 +680,10 @@ def test_bigquery_graph_json_result(monkeypatch):
html_content = display_mock.call_args_list[0][0][0].data
assert "<script>" in html_content
assert "</script>" in html_content
# Verify that the query results are embedded into the HTML, allowing them to be visualized.
# Due to escaping, it is not possible to check for graph_json_rows exactly, so we check for a few
# sentinel strings within the query results, instead.
# Sanity check that query results are not embedded into the HTML.
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQI=" in html_content
"mUZpbkdyYXBoLlBlcnNvbgB4kQI=" not in html_content
) # identifier in 1st row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQY=" in html_content
) # identifier in 2nd row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQQ=" in html_content
) # identifier in 3rd row of query result

# Make sure we can run a second graph query, after the graph server is already running.
try:
Expand All @@ -704,18 +696,6 @@ def test_bigquery_graph_json_result(monkeypatch):
html_content = display_mock.call_args_list[0][0][0].data
assert "<script>" in html_content
assert "</script>" in html_content
# Verify that the query results are embedded into the HTML, allowing them to be visualized.
# Due to escaping, it is not possible to check for graph_json_rows exactly, so we check for a few
# sentinel strings within the query results, instead.
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQI=" in html_content
) # identifier in 1st row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQY=" in html_content
) # identifier in 2nd row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQQ=" in html_content
) # identifier in 3rd row of query result

assert bqstorage_mock.called # BQ storage client was used
assert isinstance(return_value, pandas.DataFrame)
Expand Down Expand Up @@ -791,18 +771,10 @@ def test_bigquery_graph_colab(monkeypatch):
html_content = display_mock.call_args_list[0][0][0].data
assert "<script>" in html_content
assert "</script>" in html_content
# Verify that the query results are embedded into the HTML, allowing them to be visualized.
# Due to escaping, it is not possible to check for graph_json_rows exactly, so we check for a few
# sentinel strings within the query results, instead.
# Verify that the query results are not embedded into the HTML.
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQI=" in html_content
"mUZpbkdyYXBoLlBlcnNvbgB4kQI=" not in html_content
) # identifier in 1st row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQY=" in html_content
) # identifier in 2nd row of query result
assert (
"mUZpbkdyYXBoLlBlcnNvbgB4kQQ=" in html_content
) # identifier in 3rd row of query result

# Make sure we actually used colab path, not GraphServer path.
assert sys.modules["google.colab"].output.register_callback.called
Expand All @@ -818,6 +790,7 @@ def test_bigquery_graph_colab(monkeypatch):
reason="Requires `spanner-graph-notebook` and `google-cloud-bigquery-storage`",
)
def test_colab_query_callback():
graph_server.graph_server.query_result = pandas.DataFrame([], columns=["result"])
result = bigquery_magics.bigquery._colab_query_callback(
"query", json.dumps({"result": {}})
)
Expand Down
11 changes: 5 additions & 6 deletions tests/unit/test_graph_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import json
import unittest

import pandas as pd
import pytest
import requests

Expand Down Expand Up @@ -471,16 +472,14 @@ def test_post_ping(self):
)
def test_post_query(self):
self.assertTrue(self.server_thread.is_alive())
graph_server.graph_server.query_result = pd.DataFrame(
[json.dumps(row_alex_owns_account)], columns=["result"]
)
route = graph_server.graph_server.build_route(
graph_server.GraphServer.endpoints["post_query"]
)

data = {
"result": {
"0": json.dumps(row_alex_owns_account),
}
}
response = requests.post(route, json={"params": json.dumps(data)})
response = requests.post(route, json={"params": "{}"})
self.assertEqual(response.status_code, 200)
response_data = response.json()["response"]

Expand Down
Loading