Fix data size determination for DataFrames with mismatching context/target keys

mplatzer · web-flow · commit 6b19d958ba3b · 2024-12-11T09:44:18.000+01:00
diff --git a/mostlyai/qa/_common.py b/mostlyai/qa/_common.py
@@ -132,12 +132,13 @@ def determine_data_size(
     tgt_context_key: str | None = None,
 ) -> int:
     if ctx_data is not None and ctx_primary_key is not None:
-        return len(ctx_data[ctx_primary_key].unique())
-    elif ctx_data is not None and not ctx_data.empty:
-        return len(ctx_data)
+        # only keys present in both ctx and tgt data count towards the sample size
+        ctx_keys = ctx_data[ctx_primary_key].unique()
+        tgt_keys = tgt_data[tgt_context_key].unique()
+        keys = set(ctx_keys).intersection(set(tgt_keys))
+        return len(keys)
     elif tgt_data is not None and tgt_context_key is not None:
-        return len(tgt_data[tgt_context_key].unique())
-    elif tgt_data is not None and not tgt_data.empty:
-        return len(tgt_data)
+        tgt_keys = tgt_data[tgt_context_key].unique()
+        return len(tgt_keys)
     else:
-        return 0
+        return len(tgt_data)
diff --git a/tests/end_to_end/test_report.py b/tests/end_to_end/test_report.py
@@ -205,6 +205,8 @@ def make_dfs(
     test_dfs = [
         # setups with <100 rows in tgt/ctx should early terminate
         {"dfs": make_dfs(ctx_rows=99, tgt_rows=99, ctx_cols=["ctx_col"], tgt_cols=["tgt_col"]), "early_term": True},
+        {"dfs": make_dfs(ctx_rows=100, tgt_rows=100, shift=90, tgt_cols=["tgt_col"]), "early_term": True},
+        {"dfs": make_dfs(ctx_rows=100, tgt_rows=100, shift=100, tgt_cols=["tgt_col"]), "early_term": True},
         # other setups should produce report
         {"dfs": make_dfs(ctx_rows=100, tgt_rows=100), "early_term": False},
         {"dfs": make_dfs(ctx_rows=100, tgt_rows=100, ctx_cols=["ctx_col"], tgt_cols=["tgt_col"]), "early_term": False},
@@ -238,23 +240,6 @@ def test_report_few_holdout_records(tmp_path):
     assert metrics is not None
 
 
-def test_report_sequential_few_records(tmp_path):
-    # ensure that we don't crash in case of dominant zero-seq-length
-    ctx = pd.DataFrame({"id": list(range(1000))})
-    tgt = pd.DataFrame({"id": [1, 2, 3, 4, 5] * 100, "col": ["a"] * 500})
-    _, metrics = qa.report(
-        syn_tgt_data=tgt,
-        trn_tgt_data=tgt,
-        hol_tgt_data=tgt,
-        syn_ctx_data=ctx,
-        trn_ctx_data=ctx,
-        hol_ctx_data=ctx,
-        tgt_context_key="id",
-        ctx_primary_key="id",
-    )
-    assert metrics is not None
-
-
 def test_odd_column_names(tmp_path):
     values = ["a", "b"] * 50
     df = pd.DataFrame(