From 4f1dfbd12239884551d4817ff1301eb00169ce8f Mon Sep 17 00:00:00 2001 From: OlegWock Date: Fri, 7 Nov 2025 09:27:43 +0100 Subject: [PATCH 1/2] fix: Do not convert nparray into list before wrapping into pandas.Series --- deepnote_toolkit/ocelots/pandas/analyze.py | 2 +- tests/unit/test_analyze_columns_pandas.py | 97 ++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/deepnote_toolkit/ocelots/pandas/analyze.py b/deepnote_toolkit/ocelots/pandas/analyze.py index 5ee90cb..f1eed1c 100644 --- a/deepnote_toolkit/ocelots/pandas/analyze.py +++ b/deepnote_toolkit/ocelots/pandas/analyze.py @@ -24,7 +24,7 @@ def _count_unique(column): def _get_categories(np_array): - pandas_series = pd.Series(np_array.tolist()) + pandas_series = pd.Series(np_array) # special treatment for empty values num_nans = pandas_series.isna().sum().item() diff --git a/tests/unit/test_analyze_columns_pandas.py b/tests/unit/test_analyze_columns_pandas.py index e828e6d..e414232 100644 --- a/tests/unit/test_analyze_columns_pandas.py +++ b/tests/unit/test_analyze_columns_pandas.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +from trino.types import NamedRowTuple from deepnote_toolkit.ocelots.constants import DEEPNOTE_INDEX_COLUMN from deepnote_toolkit.ocelots.pandas.analyze import analyze_columns @@ -575,5 +576,101 @@ def test_mixed_column_types(self): self.assertIsNotNone(col.stats) +class TestAnalyzeColumnsWithTrinoTypes(unittest.TestCase): + def test_analyze_columns_with_named_row_tuple(self): + row1 = NamedRowTuple( + values=[1, "Alice"], + names=["id", "name"], + types=["integer", "varchar"] + ) + row2 = NamedRowTuple( + values=[2, "Bob"], + names=["id", "name"], + types=["integer", "varchar"] + ) + row3 = NamedRowTuple( + values=[1, "Alice"], + names=["id", "name"], + types=["integer", "varchar"] + ) + + np_array = np.empty(3, dtype=object) + np_array[0] = row1 + np_array[1] = row2 + np_array[2] = row3 + + df = pd.DataFrame({"col1": np_array}) + result = analyze_columns(df) + + self.assertEqual(len(result), 1) + self.assertEqual(result[0].name, "col1") + self.assertEqual(result[0].dtype, "object") + self.assertIsNotNone(result[0].stats) + self.assertIsNotNone(result[0].stats.categories) + self.assertIsInstance(result[0].stats.categories, list) + self.assertGreater(len(result[0].stats.categories), 0) + for category in result[0].stats.categories: + self.assertIn("name", category) + self.assertIn("count", category) + + def test_analyze_columns_with_named_row_tuple_and_missing_values(self): + row1 = NamedRowTuple( + values=[1, "Alice"], + names=["id", "name"], + types=["integer", "varchar"] + ) + row2 = NamedRowTuple( + values=[2, "Bob"], + names=["id", "name"], + types=["integer", "varchar"] + ) + + np_array = np.empty(4, dtype=object) + np_array[0] = row1 + np_array[1] = row2 + np_array[2] = None + np_array[3] = row1 + + df = pd.DataFrame({"col1": np_array}) + result = analyze_columns(df) + + self.assertEqual(len(result), 1) + self.assertIsNotNone(result[0].stats) + self.assertIsNotNone(result[0].stats.categories) + + category_names = [cat["name"] for cat in result[0].stats.categories] + self.assertIn("Missing", category_names) + + missing_cat = next( + cat for cat in result[0].stats.categories if cat["name"] == "Missing" + ) + self.assertEqual(missing_cat["count"], 1) + + def test_analyze_columns_with_many_named_row_tuples(self): + np_array = np.empty(20, dtype=object) + for i in range(10): + row = NamedRowTuple( + values=[i, f"User{i}"], + names=["id", "name"], + types=["integer", "varchar"] + ) + np_array[i * 2] = row + np_array[i * 2 + 1] = row + + df = pd.DataFrame({"col1": np_array}) + result = analyze_columns(df) + + self.assertEqual(len(result), 1) + self.assertIsNotNone(result[0].stats) + self.assertIsNotNone(result[0].stats.categories) + self.assertGreaterEqual(len(result[0].stats.categories), 1) + self.assertLessEqual(len(result[0].stats.categories), 3) + + has_others = any( + "others" in cat["name"] for cat in result[0].stats.categories + ) + self.assertTrue(has_others) + + if __name__ == "__main__": unittest.main() From cc011eb18d3835ca656f1c3117a623c5c93f461a Mon Sep 17 00:00:00 2001 From: OlegWock Date: Fri, 7 Nov 2025 09:30:49 +0100 Subject: [PATCH 2/2] Format --- tests/unit/test_analyze_columns_pandas.py | 26 ++++++----------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/unit/test_analyze_columns_pandas.py b/tests/unit/test_analyze_columns_pandas.py index e414232..0946e8b 100644 --- a/tests/unit/test_analyze_columns_pandas.py +++ b/tests/unit/test_analyze_columns_pandas.py @@ -579,19 +579,13 @@ def test_mixed_column_types(self): class TestAnalyzeColumnsWithTrinoTypes(unittest.TestCase): def test_analyze_columns_with_named_row_tuple(self): row1 = NamedRowTuple( - values=[1, "Alice"], - names=["id", "name"], - types=["integer", "varchar"] + values=[1, "Alice"], names=["id", "name"], types=["integer", "varchar"] ) row2 = NamedRowTuple( - values=[2, "Bob"], - names=["id", "name"], - types=["integer", "varchar"] + values=[2, "Bob"], names=["id", "name"], types=["integer", "varchar"] ) row3 = NamedRowTuple( - values=[1, "Alice"], - names=["id", "name"], - types=["integer", "varchar"] + values=[1, "Alice"], names=["id", "name"], types=["integer", "varchar"] ) np_array = np.empty(3, dtype=object) @@ -615,14 +609,10 @@ def test_analyze_columns_with_named_row_tuple(self): def test_analyze_columns_with_named_row_tuple_and_missing_values(self): row1 = NamedRowTuple( - values=[1, "Alice"], - names=["id", "name"], - types=["integer", "varchar"] + values=[1, "Alice"], names=["id", "name"], types=["integer", "varchar"] ) row2 = NamedRowTuple( - values=[2, "Bob"], - names=["id", "name"], - types=["integer", "varchar"] + values=[2, "Bob"], names=["id", "name"], types=["integer", "varchar"] ) np_array = np.empty(4, dtype=object) @@ -652,7 +642,7 @@ def test_analyze_columns_with_many_named_row_tuples(self): row = NamedRowTuple( values=[i, f"User{i}"], names=["id", "name"], - types=["integer", "varchar"] + types=["integer", "varchar"], ) np_array[i * 2] = row np_array[i * 2 + 1] = row @@ -666,9 +656,7 @@ def test_analyze_columns_with_many_named_row_tuples(self): self.assertGreaterEqual(len(result[0].stats.categories), 1) self.assertLessEqual(len(result[0].stats.categories), 3) - has_others = any( - "others" in cat["name"] for cat in result[0].stats.categories - ) + has_others = any("others" in cat["name"] for cat in result[0].stats.categories) self.assertTrue(has_others)