From acac4b449b7aba19d465e1442e608289180e1e28 Mon Sep 17 00:00:00 2001
From: boringdata
Date: Mon, 9 Mar 2026 14:27:52 +0000
Subject: [PATCH] feat: support self-joins and alias-based prefixing in YAML

When the same model is joined multiple times (e.g., airports as both
origin and destination), auto-create aliased copies with distinct table
references. Also alias joins when the YAML key differs from the model
name so dimension prefixes match the alias.

If a distinct table reference cannot be created, a warning is emitted
and the original table is reused instead of failing silently.

Closes #114

Co-Authored-By: Claude Opus 4.6
---
 src/boring_semantic_layer/tests/test_yaml.py |  79 ++++++++++
 src/boring_semantic_layer/yaml.py            | 149 ++++++++++++++----
 2 files changed, 201 insertions(+), 27 deletions(-)

diff --git a/src/boring_semantic_layer/tests/test_yaml.py b/src/boring_semantic_layer/tests/test_yaml.py
index 230619d7..3c72577f 100644
--- a/src/boring_semantic_layer/tests/test_yaml.py
+++ b/src/boring_semantic_layer/tests/test_yaml.py
@@ -1301,3 +1301,82 @@ def test_from_config_with_filter_and_joins():
     result = models["orders"].group_by("orders.oid").aggregate("orders.total").execute()
     assert "orders.oid" in result.columns
     assert len(result) == 3
+
+
+# ---------------------------------------------------------------------------
+# Issue #114: self-joins in YAML
+# ---------------------------------------------------------------------------
+
+
+def test_yaml_self_joins(duckdb_conn):
+    """Test joining the same model multiple times with different aliases (#114)."""
+    from boring_semantic_layer import to_semantic_table
+
+    duckdb_conn.raw_sql(
+        "CREATE TABLE airports_114 (code VARCHAR, city VARCHAR)"
+    )
+    duckdb_conn.raw_sql(
+        "INSERT INTO airports_114 VALUES ('SFO', 'San Francisco'), "
+        "('JFK', 'New York'), ('LAX', 'Los Angeles')"
+    )
+    duckdb_conn.raw_sql(
+        "CREATE TABLE flights_114 (origin VARCHAR, destination VARCHAR, distance INTEGER)"
+    )
+    duckdb_conn.raw_sql(
+        "INSERT INTO flights_114 VALUES ('SFO', 'JFK', 2586), "
+        "('JFK', 'LAX', 2475), ('LAX', 'SFO', 337)"
+    )
+
+    airports_model = (
+        to_semantic_table(duckdb_conn.table("airports_114"), name="airports")
+        .with_dimensions(code=lambda t: t.code, city=lambda t: t.city)
+    )
+
+    config = {
+        "flights": {
+            "table": "flights_114",
+            "dimensions": {"origin": "_.origin", "destination": "_.destination"},
+            "measures": {"total_distance": "_.distance.sum()"},
+            "joins": {
+                "origin_airport": {
+                    "model": "airports",
+                    "type": "one",
+                    "left_on": "origin",
+                    "right_on": "code",
+                },
+                "destination_airport": {
+                    "model": "airports",
+                    "type": "one",
+                    "left_on": "destination",
+                    "right_on": "code",
+                },
+            },
+        },
+    }
+
+    models = from_config(
+        config,
+        tables={
+            "flights_114": duckdb_conn.table("flights_114"),
+            "airports": airports_model,
+        },
+    )
+    df = (
+        models["flights"]
+        .group_by("origin_airport.city", "destination_airport.city")
+        .aggregate("total_distance")
+        .execute()
+    )
+    assert len(df) == 3
+    assert "origin_airport.city" in df.columns
+    assert "destination_airport.city" in df.columns
+    # Each alias must resolve against its own join key: a row count alone
+    # cannot tell two distinct airport joins from one join reused twice.
+    city_pairs = set(
+        zip(df["origin_airport.city"], df["destination_airport.city"], strict=True)
+    )
+    assert city_pairs == {
+        ("San Francisco", "New York"),
+        ("New York", "Los Angeles"),
+        ("Los Angeles", "San Francisco"),
+    }
diff --git a/src/boring_semantic_layer/yaml.py b/src/boring_semantic_layer/yaml.py
index 863189fc..9a893124 100644
--- a/src/boring_semantic_layer/yaml.py
+++ b/src/boring_semantic_layer/yaml.py
@@ -75,6 +75,110 @@ def _parse_filter(filter_expr: str) -> callable:
     return lambda t, d=deferred: d.resolve(t)
 
 
+def _resolve_join_model(
+    alias: str,
+    join_model_name: str,
+    tables: Mapping[str, Any],
+    yaml_configs: Mapping[str, Any],
+    models: dict[str, SemanticModel],
+) -> SemanticModel:
+    """Look up and return the model to join.
+
+    Sources are checked in order: models already built from this config,
+    then entries of ``tables``, then ``yaml_configs`` (which only serves to
+    produce a clearer error for a model that is defined but not yet loaded).
+
+    Args:
+        alias: Join key from the YAML config, used in error messages.
+        join_model_name: Name given in the join's ``model`` field.
+        tables: Objects supplied by the caller, keyed by name.
+        yaml_configs: Model definitions from the config, keyed by name.
+        models: Semantic models loaded so far, keyed by name.
+
+    Returns:
+        The semantic model (or semantic table) registered under
+        ``join_model_name``.
+
+    Raises:
+        TypeError: If ``tables`` holds ``join_model_name`` but the entry is
+            not a ``SemanticModel`` or ``SemanticTable``.
+        ValueError: If the model is defined in ``yaml_configs`` but has not
+            been loaded yet.
+        KeyError: If the name is not found in any source.
+    """
+    if join_model_name in models:
+        return models[join_model_name]
+    elif join_model_name in tables:
+        table = tables[join_model_name]
+        if isinstance(table, SemanticModel | SemanticTable):
+            return table
+        else:
+            raise TypeError(
+                f"Join '{alias}' references '{join_model_name}' which is not a semantic model/table"
+            )
+    elif join_model_name in yaml_configs:
+        raise ValueError(
+            f"Model '{join_model_name}' in join '{alias}' not yet loaded. Check model order."
+        )
+    else:
+        available = sorted(
+            list(models.keys())
+            + [k for k in tables if isinstance(tables.get(k), SemanticModel | SemanticTable)]
+        )
+        raise KeyError(
+            f"Model '{join_model_name}' in join '{alias}' not found. Available: {', '.join(available)}"
+        )
+
+
+def _create_aliased_model(model: SemanticModel, alias: str) -> SemanticModel:
+    """Create an aliased copy of a model with a different name for join prefixing.
+
+    For self-joins (same model joined multiple times), also creates a distinct
+    table reference via ``.view()`` to avoid ambiguous column errors.
+
+    Args:
+        model: Model whose dimensions, measures and calculated measures are
+            copied onto the aliased model.
+        alias: Name given to the copy.
+
+    Returns:
+        A new semantic model named ``alias``.
+    """
+    import warnings
+
+    base_table = model.op().to_untagged()
+
+    # Create a distinct table reference for self-joins
+    try:
+        aliased_table = base_table.view()
+    except Exception as exc:
+        # Keep the best-effort fallback, but report it: without a distinct
+        # reference a self-join can fail later with an ambiguous column error
+        # that gives no hint of where it came from.
+        warnings.warn(
+            "Could not create a distinct table reference for join alias "
+            f"'{alias}' ({exc!r}); reusing the original table.",
+            stacklevel=2,
+        )
+        aliased_table = base_table
+
+    aliased_model = to_semantic_table(aliased_table, name=alias)
+
+    dims = model.get_dimensions()
+    if dims:
+        aliased_model = aliased_model.with_dimensions(**dims)
+
+    measures = model.get_measures()
+    if measures:
+        aliased_model = aliased_model.with_measures(**measures)
+
+    calc_measures = model.get_calculated_measures()
+    if calc_measures:
+        aliased_model = aliased_model.with_measures(**calc_measures)
+
+    return aliased_model
+
+
 def _parse_joins(
     joins_config: dict[str, Mapping[str, Any]],
     tables: Mapping[str, Any],
@@ -85,39 +189,30 @@ def _parse_joins(
     """Parse join configuration and apply joins to a semantic model."""
     result_model = models[current_model_name]
 
+    # Track which models have been joined to detect self-joins
+    joined_model_names: dict[str, int] = {}
+
     # Process each join definition
     for alias, join_config in joins_config.items():
         join_model_name = join_config.get("model")
         if not join_model_name:
             raise ValueError(f"Join '{alias}' must specify 'model' field")
 
-        # Look up the model to join - check in order: models, tables, yaml_configs
-        if join_model_name in models:
-            # Already loaded model from this YAML
-            join_model = models[join_model_name]
-        elif join_model_name in tables:
-            # Table passed via tables parameter
-            table = tables[join_model_name]
-            if isinstance(table, SemanticModel | SemanticTable):
-                join_model = table
-            else:
-                raise TypeError(
-                    f"Join '{alias}' references '{join_model_name}' which is not a semantic model/table"
-                )
-        elif join_model_name in yaml_configs:
-            # Defined in YAML but not yet loaded - wrong order
-            raise ValueError(
-                f"Model '{join_model_name}' in join '{alias}' not yet loaded. Check model order."
-            )
-        else:
-            # Not found anywhere
-            available = sorted(
-                list(models.keys())
-                + [k for k in tables if isinstance(tables.get(k), SemanticModel | SemanticTable)]
-            )
-            raise KeyError(
-                f"Model '{join_model_name}' in join '{alias}' not found. Available: {', '.join(available)}"
-            )
+        join_model = _resolve_join_model(alias, join_model_name, tables, yaml_configs, models)
+
+        # Create an aliased copy when the alias differs from the model name,
+        # or for self-joins (same model joined multiple times).
+        # This ensures dimension prefixes match the YAML alias (e.g., "origin_airport.city")
+        # rather than the underlying model name (e.g., "airports.city").
+        join_count = joined_model_names.get(join_model_name, 0)
+        needs_alias = (
+            alias != join_model_name
+            or join_count > 0
+            or join_model_name == current_model_name
+        )
+        if needs_alias:
+            join_model = _create_aliased_model(join_model, alias)
+        joined_model_names[join_model_name] = join_count + 1
 
         # Apply the join based on type
         join_type = join_config.get("type", "one")  # Default to one-to-one