From 62ac45f278cd516e0e9e54968cd4b250014c19cc Mon Sep 17 00:00:00 2001 From: Eric Pan Date: Thu, 14 Aug 2025 14:29:58 -0700 Subject: [PATCH 1/5] feat(table): add extract, from_path, and join methods for advanced data operations --- chidian/table.py | 396 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) diff --git a/chidian/table.py b/chidian/table.py index 2a94d56..3f298fa 100644 --- a/chidian/table.py +++ b/chidian/table.py @@ -335,6 +335,402 @@ def group_by(self, path: str) -> dict[Any, "Table"]: return {key: Table(rows) for key, rows in groups.items()} + def extract(self, path: str) -> "Table": + """ + Extract values from all rows using a path and return as a new Table. + + This method is particularly useful for extracting nested structures + like FHIR Bundle entries or other collections within your data. + + Args: + path: Path expression to extract from each row + - Supports wildcards: "entries[*].resource" + - Supports nested paths: "patient.address[0].city" + + Returns: + New Table where each extracted value becomes a row. + If path uses wildcards and returns lists, the lists are flattened. + None values are filtered out. + + Examples: + >>> # FHIR Bundle example + >>> bundle_table = Table([fhir_bundle]) + >>> resources = bundle_table.extract("entries[*].resource") + + >>> # Extract nested values from multiple rows + >>> patients_table = Table([patient1, patient2, patient3]) + >>> addresses = patients_table.extract("address[*]") + + >>> # Simple field extraction + >>> names = patients_table.extract("name.given[0]") + """ + # Get extracted values using existing logic + extracted = self.get(path) + + # Handle the case where get() returns a single value (shouldn't happen for non-$ paths, but be safe) + if not isinstance(extracted, list): + extracted = [extracted] + + # Flatten any nested lists and filter out None values + flattened = [] + for item in extracted: + if item is None: + continue + elif isinstance(item, list): + # Flatten lists from wildcard extractions + flattened.extend(item) + else: + flattened.append(item) + + # Return new Table with extracted values as rows + return Table(flattened) + + @classmethod + def from_path(cls, data: Any, path: str) -> "Table": + """ + Create a Table by extracting a path from source data. + + This is a convenience constructor that's perfect for extracting + collections from complex nested structures like FHIR Bundles. + + Args: + data: Source data structure (dict, list, or any nested structure) + path: Path expression to extract + - "entries[*].resource" for FHIR Bundle resources + - "results[*].observation" for lab results + - "items[*]" for simple list extraction + + Returns: + New Table with extracted data as rows + + Examples: + >>> # Extract FHIR Bundle resources directly + >>> resources_table = Table.from_path(fhir_bundle, "entries[*].resource") + + >>> # Extract nested arrays + >>> observations = Table.from_path(lab_report, "results[*].observation") + + >>> # Extract from lists + >>> items = Table.from_path({"data": [item1, item2]}, "data[*]") + """ + # Create temporary single-row table with the source data + temp_table = cls([data]) + + # Extract the path and return the result + return temp_table.extract(f"$0.{path}") + + def join( + self, + other: "Table", + on: str | tuple[str, str] | list[str | tuple[str, str]] | None = None, + how: str = "left", + suffixes: tuple[str, str] = ("", "_2"), + ) -> "Table": + """ + Join two tables based on matching column values. + + Supports SQL-like join operations with flexible key specification + and path-based column access. + + Args: + other: The right table to join with + on: Join key specification: + - str: Same column name in both tables ("id") + - tuple: Different names (("id", "patient_id")) + - list[str]: Multiple columns, same names (["id", "type"]) + - list[tuple]: Multiple columns, different names + ([("patient_id", "subject_id"), ("date", "visit_date")]) + - None: Natural join on all common columns + how: Join type - "left" (default), "inner", "right", or "outer" + suffixes: Tuple of suffixes for conflicting column names. + Default ("", "_2") adds "_2" to right table conflicts. + + Returns: + New Table with joined data + + Examples: + >>> # Simple join on same column + >>> patients.join(visits, on="patient_id") + + >>> # Join with different column names + >>> patients.join(visits, on=("id", "patient_id")) + + >>> # Multiple join keys + >>> orders.join(items, on=[("order_id", "oid"), "date"]) + + >>> # Inner join with path expression + >>> patients.join(observations, + ... on=("id", "subject.reference"), + ... how="inner") + """ + # Parse the 'on' parameter to get join column specifications + left_keys, right_keys = self._parse_join_keys(on, other) + + # Build lookup dictionary from right table for efficient joining + right_lookup = self._build_join_lookup(other, right_keys) + + # Perform the join based on the specified type + if how == "left": + return self._left_join(left_keys, right_lookup, suffixes) + elif how == "inner": + return self._inner_join(left_keys, right_lookup, suffixes) + elif how == "right": + return self._right_join( + other, left_keys, right_keys, right_lookup, suffixes + ) + elif how == "outer": + return self._outer_join(left_keys, right_lookup, suffixes) + else: + raise ValueError( + f"Invalid join type: {how}. Must be 'left', 'inner', 'right', or 'outer'" + ) + + def _parse_join_keys( + self, + on: str | tuple[str, str] | list[str | tuple[str, str]] | None, + other: "Table", + ) -> tuple[list[str], list[str]]: + """Parse the 'on' parameter to extract left and right join keys.""" + if on is None: + # Natural join - find common columns + common = self.columns & other.columns + if not common: + raise ValueError("No common columns found for natural join") + left_keys = list(common) + right_keys = list(common) + elif isinstance(on, str): + # Single column, same name + left_keys = [on] + right_keys = [on] + elif isinstance(on, tuple) and len(on) == 2: + # Single column, different names + left_keys = [on[0]] + right_keys = [on[1]] + elif isinstance(on, list): + left_keys = [] + right_keys = [] + for item in on: + if isinstance(item, str): + # Same column name + left_keys.append(item) + right_keys.append(item) + elif isinstance(item, tuple) and len(item) == 2: + # Different column names + left_keys.append(item[0]) + right_keys.append(item[1]) + else: + raise ValueError(f"Invalid join key specification: {item}") + else: + raise ValueError(f"Invalid 'on' parameter: {on}") + + return left_keys, right_keys + + def _build_join_lookup( + self, table: "Table", keys: list[str] + ) -> dict[tuple, list[dict[str, Any]]]: + """Build a lookup dictionary for efficient joining.""" + lookup: dict[tuple, list[dict[str, Any]]] = {} + + for row in table._rows: + # Extract key values using get() to support paths + key_values = [] + for key in keys: + value = get(row, key, default=None) + # Convert unhashable types to strings for lookup + try: + hash(value) + key_values.append(value) + except TypeError: + key_values.append(str(value)) + + key_tuple = tuple(key_values) + if key_tuple not in lookup: + lookup[key_tuple] = [] + lookup[key_tuple].append(row) + + return lookup + + def _merge_rows( + self, + left_row: dict[str, Any], + right_row: dict[str, Any] | None, + suffixes: tuple[str, str], + join_keys: set[str] | None = None, + ) -> dict[str, Any]: + """Merge two rows, handling column conflicts with suffixes.""" + if right_row is None: + return left_row.copy() + + merged = left_row.copy() + left_suffix, right_suffix = suffixes + join_keys = join_keys or set() + + for key, value in right_row.items(): + if key in merged: + # Check if this is a join key - if so, don't apply suffixes + if key in join_keys: + # Join key - keep as-is (left value takes precedence) + continue + + # Column conflict - apply suffixes + if left_suffix: + # Rename left column + merged[key + left_suffix] = merged[key] + del merged[key] + + if right_suffix: + merged[key + right_suffix] = value + elif not left_suffix: + # No suffixes - right overwrites left + merged[key] = value + else: + # No conflict + merged[key] = value + + return merged + + def _left_join( + self, + left_keys: list[str], + right_lookup: dict[tuple, list[dict[str, Any]]], + suffixes: tuple[str, str], + ) -> "Table": + """Perform a left outer join.""" + result_rows = [] + + for left_row in self._rows: + # Extract key values from left row + key_values = [] + for key in left_keys: + value = get(left_row, key, default=None) + try: + hash(value) + key_values.append(value) + except TypeError: + key_values.append(str(value)) + + key_tuple = tuple(key_values) + matching_rows = right_lookup.get(key_tuple, [None]) + + # Create a result row for each match (or one with None if no matches) + for right_row in matching_rows: + result_rows.append( + self._merge_rows(left_row, right_row, suffixes, set(left_keys)) + ) + + return Table(result_rows) + + def _inner_join( + self, + left_keys: list[str], + right_lookup: dict[tuple, list[dict[str, Any]]], + suffixes: tuple[str, str], + ) -> "Table": + """Perform an inner join.""" + result_rows = [] + + for left_row in self._rows: + # Extract key values from left row + key_values = [] + for key in left_keys: + value = get(left_row, key, default=None) + try: + hash(value) + key_values.append(value) + except TypeError: + key_values.append(str(value)) + + key_tuple = tuple(key_values) + matching_rows = right_lookup.get(key_tuple, []) + + # Only add rows when there are matches + for right_row in matching_rows: + result_rows.append( + self._merge_rows(left_row, right_row, suffixes, set(left_keys)) + ) + + return Table(result_rows) + + def _right_join( + self, + other: "Table", + left_keys: list[str], + right_keys: list[str], + right_lookup: dict[tuple, list[dict[str, Any]]], + suffixes: tuple[str, str], + ) -> "Table": + """Perform a right outer join.""" + # Build lookup for left table + left_lookup = self._build_join_lookup(self, left_keys) + + result_rows = [] + for right_row in other._rows: + # Extract key values from right row + key_values = [] + for key in right_keys: + value = get(right_row, key, default=None) + try: + hash(value) + key_values.append(value) + except TypeError: + key_values.append(str(value)) + + key_tuple = tuple(key_values) + matching_rows = left_lookup.get(key_tuple, [None]) + + # Create a result row for each match (or one with None if no matches) + for left_row in matching_rows: + if left_row is None: + # No match - just use right row + result_rows.append(right_row.copy()) + else: + result_rows.append( + self._merge_rows(left_row, right_row, suffixes, set(left_keys)) + ) + + return Table(result_rows) + + def _outer_join( + self, + left_keys: list[str], + right_lookup: dict[tuple, list[dict[str, Any]]], + suffixes: tuple[str, str], + ) -> "Table": + """Perform a full outer join.""" + result_rows = [] + seen_right_keys = set() + + # Process all left rows + for left_row in self._rows: + # Extract key values from left row + key_values = [] + for key in left_keys: + value = get(left_row, key, default=None) + try: + hash(value) + key_values.append(value) + except TypeError: + key_values.append(str(value)) + + key_tuple = tuple(key_values) + matching_rows = right_lookup.get(key_tuple, [None]) + + if matching_rows != [None]: + seen_right_keys.add(key_tuple) + + for right_row in matching_rows: + result_rows.append( + self._merge_rows(left_row, right_row, suffixes, set(left_keys)) + ) + + # Add unmatched right rows + for key_tuple, right_rows in right_lookup.items(): + if key_tuple not in seen_right_keys: + for right_row in right_rows: + result_rows.append(right_row.copy()) + + return Table(result_rows) + def head(self, n: int = 5) -> "Table": """ Return first n rows. From d90671bd400f3792f38d87a6aeddcaca4f364aab Mon Sep 17 00:00:00 2001 From: Eric Pan Date: Thu, 14 Aug 2025 14:30:47 -0700 Subject: [PATCH 2/5] chore(gitignore): expand with comprehensive ignore rules - Expand .gitignore with comprehensive rules --- .gitignore | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.gitignore b/.gitignore index 22b6bab..a6837ea 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,36 @@ docs/_build/ # Pyenv .python-version + +# BEGIN roobert +# Roobert +.roobert/ +.DS_Store +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +.venv/ +venv/ +.env +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +# END roobert From 2d90ed11c343973dc67e5f5e6f58bb8947639945 Mon Sep 17 00:00:00 2001 From: Eric Pan Date: Thu, 14 Aug 2025 15:01:44 -0700 Subject: [PATCH 3/5] feat(table): add dot notation access for rows and columns - Add enhanced dict access for Table class - Update tests to use new access pattern --- chidian/table.py | 32 ++++++++++++++++++++++++++++++++ tests/test_table.py | 26 +++++++++++++------------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/chidian/table.py b/chidian/table.py index 3f298fa..eb32196 100644 --- a/chidian/table.py +++ b/chidian/table.py @@ -731,6 +731,38 @@ def _outer_join( return Table(result_rows) + def __getitem__(self, key: str) -> Any: + """ + Enhanced dict access with dot syntax support. + + Supports both traditional dict access and path-based access: + - table["$0"] → returns the row dict (traditional) + - table["$0.name"] → extracts value using path syntax + - table["column"] → extracts column values from all rows (same as get()) + + Args: + key: Either a row key ("$0") or a path expression ("$0.name", "column") + + Returns: + For row keys: the row dict + For path expressions: the extracted value(s) + + Examples: + >>> table["$0"] # Get entire row + {"name": "John", "age": 30} + >>> table["$0.name"] # Get specific field from row + "John" + >>> table["name"] # Get column from all rows + ["John", "Jane", "Bob"] + """ + # Check if this is a path expression (contains dot) + if "." in key: + # Use the get() method which handles path syntax + return self.get(key) + + # Traditional dict access - delegate to parent + return super().__getitem__(key) + def head(self, n: int = 5) -> "Table": """ Return first n rows. diff --git a/tests/test_table.py b/tests/test_table.py index 0740498..a9d26d0 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -29,8 +29,8 @@ def test_dict_indexing(): ) # Test basic dict access - assert table["$0"]["name"] == "John" - assert table["$1"]["name"] == "Jane" + assert table["$0.name"] == "John" + assert table["$1.name"] == "Jane" # Test dict.get() method (inherited) assert dict.get(table, "$0")["name"] == "John" @@ -119,15 +119,15 @@ def test_filter_method(): assert "$0" in active_table assert "$1" in active_table assert "$2" in active_table - assert active_table["$0"]["name"] == "John" - assert active_table["$1"]["name"] == "Bob" - assert active_table["$2"]["name"] == "Alice" + assert active_table["$0.name"] == "John" + assert active_table["$1.name"] == "Bob" + assert active_table["$2.name"] == "Alice" # Filter by age young_table = table.filter(lambda x: x.get("age", 0) < 30) assert len(young_table) == 2 - assert list(young_table)[0]["name"] == "Jane" - assert list(young_table)[1]["name"] == "Alice" + assert young_table["$0.name"] == "Jane" + assert young_table["$1.name"] == "Alice" def test_map_method(): @@ -178,20 +178,20 @@ def test_append_method(): # Append with auto-generated key table.append({"name": "John"}) assert len(table) == 1 - assert table["$0"]["name"] == "John" + assert table["$0.name"] == "John" # Append with specific key (should get $ prefix) table.append({"name": "Jane"}, custom_key="jane_key") - assert table["$jane_key"]["name"] == "Jane" + assert table["$jane_key.name"] == "Jane" assert len(table) == 2 # Append another auto-keyed row table.append({"name": "Bob"}) - assert table["$2"]["name"] == "Bob" + assert table["$2.name"] == "Bob" assert len(table) == 3 # Test accessing named row with dict access - assert table["$jane_key"]["name"] == "Jane" + assert table["$jane_key.name"] == "Jane" def test_unique_method(): @@ -417,8 +417,8 @@ def test_init_with_dict(): assert len(table) == 2 assert "$user1" in table assert "$user2" in table - assert table["$user1"]["name"] == "John" - assert table["$user2"]["name"] == "Jane" + assert table["$user1.name"] == "John" + assert table["$user2.name"] == "Jane" def test_empty_table(): From 1ee847cd7a4a588dd1f628f891530eef9a15a768 Mon Sep 17 00:00:00 2001 From: Eric Pan Date: Thu, 14 Aug 2025 15:03:56 -0700 Subject: [PATCH 4/5] chore: bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4d63985..583dc6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "chidian" -version = "0.1.1" +version = "0.1.2" requires-python = ">=3.8" classifiers = [ "Programming Language :: Python :: 3", From 78256cdb4afce33fe1cf57ffb8c9cd73a23fdb27 Mon Sep 17 00:00:00 2001 From: Eric Pan Date: Thu, 14 Aug 2025 15:16:56 -0700 Subject: [PATCH 5/5] refactor(table): remove dict inheritance and use composition - Refactor Table class to remove dict inheritance - Improve row key management - Modify test cases for new implementation - Bump package version to 0.1.2 --- chidian/table.py | 47 +++++++++++++++++++++++++++++---------------- tests/test_table.py | 8 ++++---- uv.lock | 2 +- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/chidian/table.py b/chidian/table.py index eb32196..dfe579b 100644 --- a/chidian/table.py +++ b/chidian/table.py @@ -12,11 +12,10 @@ """ -class Table(dict): +class Table: def __init__( self, rows: Union[list[dict[str, Any]], dict[str, dict[str, Any]], None] = None, - **kwargs, ): """ Initialize a Table from rows. @@ -26,11 +25,10 @@ def __init__( - list[dict]: Each dict is a row, auto-keyed by index ($0, $1, ...) - dict[str, dict]: Pre-keyed rows (keys preserved) - None: Empty table - **kwargs: Additional dict initialization parameters """ - super().__init__(**kwargs) self._rows: list[dict[str, Any]] = [] self._row_keys: dict[str, int] = {} # Maps row keys to indices + self._key_to_row: dict[str, dict[str, Any]] = {} # Maps $ keys to row dicts # Initialize rows based on input type if rows is not None: @@ -39,7 +37,7 @@ def __init__( # Store rows by index using $-syntax for i, row in enumerate(rows): key = f"${i}" - self[key] = row + self._key_to_row[key] = row self._row_keys[key] = i elif isinstance(rows, dict): self._rows = list(rows.values()) @@ -48,7 +46,7 @@ def __init__( # Ensure keys start with $ for consistency if not key.startswith("$"): key = f"${key}" - self[key] = row + self._key_to_row[key] = row self._row_keys[key] = i def get(self, path: str, default: Any = None) -> Union[Any, list[Any]]: @@ -95,11 +93,11 @@ def get(self, path: str, default: Any = None) -> Union[Any, list[Any]]: row_key = parts[0] # Check if this key exists - if row_key not in self: + if row_key not in self._key_to_row: return default # Get the specific row - row = self[row_key] + row = self._key_to_row[row_key] # If there's a remaining path, extract from the row if len(parts) > 1: @@ -141,7 +139,7 @@ def to_list(self) -> list[dict[str, Any]]: def to_dict(self) -> dict[str, dict[str, Any]]: """Return rows as a dict keyed by row identifiers.""" - return dict(self) + return self._key_to_row.copy() def append(self, row: dict[str, Any], custom_key: Optional[str] = None) -> None: """ @@ -174,7 +172,7 @@ def append(self, row: dict[str, Any], custom_key: Optional[str] = None) -> None: else: key = custom_key - self[key] = row + self._key_to_row[key] = row self._row_keys[key] = len(self._rows) - 1 def filter(self, predicate: Union[str, Callable[[dict], bool]]) -> "Table": @@ -733,10 +731,10 @@ def _outer_join( def __getitem__(self, key: str) -> Any: """ - Enhanced dict access with dot syntax support. + Enhanced access with dot syntax support. - Supports both traditional dict access and path-based access: - - table["$0"] → returns the row dict (traditional) + Supports both row access and path-based access: + - table["$0"] → returns the row dict - table["$0.name"] → extracts value using path syntax - table["column"] → extracts column values from all rows (same as get()) @@ -755,13 +753,28 @@ def __getitem__(self, key: str) -> Any: >>> table["name"] # Get column from all rows ["John", "Jane", "Bob"] """ - # Check if this is a path expression (contains dot) - if "." in key: + # Check if this is a path expression (contains dot) or column access + if "." in key or not key.startswith("$"): # Use the get() method which handles path syntax return self.get(key) - # Traditional dict access - delegate to parent - return super().__getitem__(key) + # Row key access + if key in self._key_to_row: + return self._key_to_row[key] + else: + raise KeyError(key) + + def __contains__(self, key: str) -> bool: + """Check if a row key exists in the table.""" + return key in self._key_to_row + + def __setitem__(self, key: str, value: dict[str, Any]) -> None: + """Set a row by key (mainly for internal use).""" + if not key.startswith("$"): + raise ValueError("Row keys must start with '$'") + self._key_to_row[key] = value + # Note: This doesn't update _rows or _row_keys for simplicity + # Main usage should be through append() method def head(self, n: int = 5) -> "Table": """ diff --git a/tests/test_table.py b/tests/test_table.py index a9d26d0..7171bb0 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -32,10 +32,10 @@ def test_dict_indexing(): assert table["$0.name"] == "John" assert table["$1.name"] == "Jane" - # Test dict.get() method (inherited) - assert dict.get(table, "$0")["name"] == "John" - assert dict.get(table, "$nonexistent") is None - assert dict.get(table, "$nonexistent", "default") == "default" + # Test __contains__ method + assert "$0" in table + assert "$1" in table + assert "$nonexistent" not in table # Test with custom keys table.append({"name": "Bob", "age": 35}, custom_key="bob") diff --git a/uv.lock b/uv.lock index c015cd9..0c3ec6b 100644 --- a/uv.lock +++ b/uv.lock @@ -28,7 +28,7 @@ wheels = [ [[package]] name = "chidian" -version = "0.1.1" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "parsimonious" },