From 86b2eb2517319a6c153c0ba48ed49d94683b3b28 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 15:24:02 +0000 Subject: [PATCH 01/29] feat: add summary generation and export functionality for results - Add _extract_table_info() helper method to extract variables and record counts from table outputs - Add _extract_regression_info() helper method to extract observation counts from regression outputs - Add _mark_diff_risk() helper method to identify outputs with differencing risk - Add write_summary() method to export summary to CSV - Enhance finalise_excel() to include summary sheet as first sheet - Add comprehensive test coverage for all helper methods and edge cases --- acro/record.py | 177 +++++++++++++++++++++++++++++++++ test/test_initial.py | 231 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 408 insertions(+) diff --git a/acro/record.py b/acro/record.py index aa29a5de..94e00ae4 100644 --- a/acro/record.py +++ b/acro/record.py @@ -430,6 +430,174 @@ def validate_outputs(self) -> None: ) record.exception = input("") + def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]: + """Extract variables and total records from table output. + + Parameters + ---------- + output : list + The output to extract information from. + method : str + The method used to generate the output. + + Returns + ------- + tuple[list[str], int] + Variables and total record count. + """ + variables: list[str] = [] + total_records: int = 0 + + for table in output: + if isinstance(table, DataFrame): + # Extract index names + if hasattr(table.index, "names") and any(table.index.names): + variables.extend(str(n) for n in table.index.names if n is not None) + elif table.index.name: + variables.append(str(table.index.name)) + + # Extract column names + if hasattr(table.columns, "names") and any(table.columns.names): + variables.extend( + str(n) for n in table.columns.names if n is not None + ) + elif table.columns.name: + variables.append(str(table.columns.name)) + + # Calculate total records + try: + total_records += int(table.shape[0] * table.shape[1]) + if method == "crosstab": + cell_sum = table.values[~pd.isna(table.values)].sum() + if cell_sum > 0: + total_records = int(cell_sum) + except (TypeError, ValueError): + pass + + return variables, total_records + + def _extract_regression_info(self, output: list) -> tuple[list[str], int]: + """Extract variables and total records from regression output. + + Parameters + ---------- + output : list + The output to extract information from. + + Returns + ------- + tuple[list[str], int] + Variables and total record count. + """ + variables: list[str] = [] + total_records: int = 0 + + for table in output: + if isinstance(table, DataFrame): + for idx in table.index: + idx_str = str(idx) + if "no. observations" in idx_str.lower(): + try: + val = table.loc[idx].dropna().iloc[0] + total_records = int(float(val)) + except (ValueError, TypeError, IndexError): + pass + break + + return variables, total_records + + def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame: + """Mark outputs with differencing risk. + + Parameters + ---------- + summary_df : DataFrame + The summary DataFrame to update. + + Returns + ------- + DataFrame + Updated summary DataFrame with diff_risk column. + """ + if summary_df.empty: + summary_df["diff_risk"] = pd.Series(dtype=bool) + else: + summary_df["diff_risk"] = False + table_mask = summary_df["type"] == "table" + table_outputs = summary_df.loc[table_mask] + if not table_outputs.empty: + for _, group in table_outputs.groupby("variables"): + if len(group) > 1: + counts = group["total_records"].unique() + if len(counts) > 1: + summary_df.loc[group.index, "diff_risk"] = True + + return summary_df + + def generate_summary(self) -> DataFrame: + """Generate a summary DataFrame of all outputs in the session. + + Provides output checkers with a high-level overview of all outputs, + including what method was used, what variables are involved, the + total record count, and whether there is a differencing risk. + + Returns + ------- + DataFrame + Summary of all outputs with columns: id, method, status, type, + command, summary, variables, total_records, timestamp, diff_risk. + """ + rows = [] + for uid, rec in self.results.items(): + method = rec.properties.get("method", rec.output_type) + variables: list[str] = [] + total_records: int = 0 + + if rec.output_type == "table": + variables, total_records = self._extract_table_info(rec.output, method) + elif rec.output_type == "regression": + variables, total_records = self._extract_regression_info(rec.output) + dof = rec.properties.get("dof") + if dof is not None: + variables.append(f"dof={dof}") + + variables_str = "; ".join(variables) if variables else "" + + rows.append( + { + "id": uid, + "method": method, + "status": rec.status, + "type": rec.output_type, + "command": rec.command, + "summary": rec.summary, + "variables": variables_str, + "total_records": total_records, + "timestamp": rec.timestamp, + } + ) + + summary_df = DataFrame(rows) + summary_df = self._mark_diff_risk(summary_df) + + return summary_df + + def write_summary(self, path: str) -> None: + """Write the session summary to a CSV file. + + Parameters + ---------- + path : str + Name of the output folder. + """ + summary_df = self.generate_summary() + if summary_df.empty: + return + os.makedirs(path, exist_ok=True) + filename = os.path.normpath(f"{path}/summary.csv") + summary_df.to_csv(filename, index=False) + logger.info("summary written to: %s", filename) + def finalise(self, path: str, ext: str, interactive: bool = False) -> None: """Create a results file for checking. @@ -451,6 +619,7 @@ def finalise(self, path: str, ext: str, interactive: bool = False) -> None: self.finalise_excel(path) else: raise ValueError("Invalid file extension. Options: {json, xlsx}") + self.write_summary(path) self.write_checksums(path) # check if the directory acro_artifacts exists and delete it if os.path.exists("acro_artifacts"): @@ -513,6 +682,13 @@ def finalise_excel(self, path: str) -> None: with pd.ExcelWriter( # pylint: disable=abstract-class-instantiated filename, engine="openpyxl" ) as writer: + # summary sheet + summary_df = self.generate_summary() + if not summary_df.empty: + summary_df.to_excel( + writer, sheet_name="summary", index=False, startrow=0 + ) + # description sheet sheet = [] summary = [] @@ -527,6 +703,7 @@ def finalise_excel(self, path: str) -> None: {"Sheet": sheet, "Command": command, "Summary": summary} ) tmp_df.to_excel(writer, sheet_name="description", index=False, startrow=0) + # individual sheets for output_id, output in self.results.items(): if output.output_type == "custom": diff --git a/test/test_initial.py b/test/test_initial.py index 42e50c7b..591c8fde 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1250,3 +1250,234 @@ def test_toggle_suppression(): assert acro.suppress acro.disable_suppression() assert not acro.suppress + + +def test_summary_csv_created_with_json(data, acro): + """Test that summary.csv is created when finalising to JSON (Issue #224).""" + _ = acro.crosstab(data.year, data.grant_type) + acro.add_exception("output_0", "Let me have it") + acro.finalise(PATH, "json") + summary_path = os.path.normpath(f"{PATH}/summary.csv") + assert os.path.exists(summary_path), "summary.csv was not created" + summary_df = pd.read_csv(summary_path) + assert len(summary_df) == 1 + assert "id" in summary_df.columns + assert "method" in summary_df.columns + assert "variables" in summary_df.columns + assert "total_records" in summary_df.columns + assert "diff_risk" in summary_df.columns + # check values + assert summary_df.iloc[0]["id"] == "output_0" + assert summary_df.iloc[0]["method"] == "crosstab" + assert summary_df.iloc[0]["total_records"] > 0 + shutil.rmtree(PATH) + + +def test_summary_sheet_in_excel(data, acro): + """Test that a 'summary' sheet is created in results.xlsx (Issue #224).""" + _ = acro.crosstab(data.year, data.grant_type) + _ = acro.pivot_table( + data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"] + ) + acro.add_exception("output_0", "Let me have it") + acro.add_exception("output_1", "I need this") + acro.finalise(PATH, "xlsx") + filename = os.path.normpath(f"{PATH}/results.xlsx") + xl = pd.ExcelFile(filename) + assert "summary" in xl.sheet_names, "'summary' sheet not found in Excel" + summary_df = pd.read_excel(filename, sheet_name="summary") + xl.close() + assert len(summary_df) == 2 + methods = summary_df["method"].tolist() + assert "crosstab" in methods + assert "pivot_table" in methods + shutil.rmtree(PATH) + + +def test_summary_variables_extracted(data): + """Test that variable names are correctly extracted into the summary.""" + acro_obj = ACRO(suppress=False) + _ = acro_obj.crosstab(data.year, data.grant_type) + acro_obj.add_exception("output_0", "Let me have it") + acro_obj.finalise(PATH, "json") + summary_path = os.path.normpath(f"{PATH}/summary.csv") + summary_df = pd.read_csv(summary_path) + variables = summary_df.iloc[0]["variables"] + assert "year" in variables + assert "grant_type" in variables + shutil.rmtree(PATH) + + +def test_summary_differencing_risk(data): + """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224).""" + acro_obj = ACRO(suppress=True) + _ = acro_obj.crosstab(data.year, data.grant_type) + data_subset = data.iloc[10:].copy() + _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type) + acro_obj.add_exception("output_0", "Let me have it") + acro_obj.add_exception("output_1", "I need this") + + summary_df = acro_obj.results.generate_summary() + + assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"] + + assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"] + + assert summary_df.iloc[0]["diff_risk"] == True + assert summary_df.iloc[1]["diff_risk"] == True + + +def test_summary_regression_metadata(data, acro): + """Test that regression outputs are correctly captured in the summary.""" + new_df = data[["inc_activity", "inc_grants", "inc_donations", "total_costs"]] + new_df = new_df.dropna() + endog = new_df.inc_activity + exog = new_df[["inc_grants", "inc_donations", "total_costs"]] + exog = add_constant(exog) + _ = acro.ols(endog, exog) + summary_df = acro.results.generate_summary() + assert len(summary_df) == 1 + row = summary_df.iloc[0] + assert row["method"] == "ols" + assert row["type"] == "regression" + assert row["total_records"] > 0 + assert "dof=" in row["variables"] + + +def test_summary_empty_session(): + """Test that summary handles an empty session gracefully.""" + acro_obj = ACRO(suppress=False) + summary_df = acro_obj.results.generate_summary() + assert summary_df.empty + assert "diff_risk" in summary_df.columns + + +def test_extract_table_info(data, acro): + """Test the _extract_table_info helper method.""" + acro.crosstab(data.year, data.grant_type) + output = acro.results.get_index(0) + + # Test extracting info from table output with crosstab method + variables, total_records = acro.results._extract_table_info( + output.output, "crosstab" + ) + assert len(variables) > 0 + assert total_records > 0 + assert "year" in variables or "grant_type" in variables + + # Also test with non-crosstab method to ensure both paths work + variables2, total_records2 = acro.results._extract_table_info( + output.output, "pivot_table" + ) + assert len(variables2) > 0 + assert total_records2 > 0 + + +def test_extract_regression_info(): + """Test the _extract_regression_info helper method with sample data.""" + # Create a simple regression output for testing + records = Records() + + # Create sample regression output + reg_output = pd.DataFrame( + { + "coef": [1.0, 2.0], + "std err": [0.1, 0.2], + "t": [10.0, 10.0], + "P>|t|": [0.0, 0.0], + }, + index=["const", "x1"], + ) + + # Create a second table with "no. observations" + obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"]) + + output = [reg_output, obs_output] + variables, total_records = records._extract_regression_info(output) + + # Should have extracted the observations + assert isinstance(variables, list) + assert total_records == 100 + + +def test_mark_diff_risk(data, acro): + """Test the _mark_diff_risk helper method.""" + # Create two crosstabs with same variables but different data + acro.crosstab(data.year, data.grant_type) + acro.crosstab(data.year, data.grant_type) + + summary_df = acro.results.generate_summary() + + # Check that diff_risk column exists + assert "diff_risk" in summary_df.columns + # Both outputs should have diff_risk marked + assert all(isinstance(x, (bool, type(pd.NA))) for x in summary_df["diff_risk"]) + + +def test_extract_table_info_with_zero_cell_sum(): + """Test _extract_table_info when cell_sum equals 0.""" + records = Records() + + # Create a table with all NaN values to trigger cell_sum = 0 path + table = pd.DataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], + index=["row1", "row2"], + columns=["col1", "col2"], + ) + table.index.name = "idx" + table.columns.name = "cols" + + output = [table] + variables, total_records = records._extract_table_info(output, "crosstab") + + # Should extract variables and use shape-based count when cell_sum is 0 + assert "idx" in variables + assert "cols" in variables + assert total_records > 0 + + +def test_extract_table_info_exception_handling(): + """Test _extract_table_info exception handling with invalid table.""" + records = Records() + + # Create a table that will cause TypeError during shape multiplication + class InvalidTable: + def __init__(self): + self.shape = "invalid" # Non-integer shape to trigger TypeError + + output = [InvalidTable()] + variables, total_records = records._extract_table_info(output, "crosstab") + + # Should handle exception gracefully + assert isinstance(variables, list) + assert isinstance(total_records, int) + + +def test_extract_regression_info_with_missing_observations(): + """Test _extract_regression_info when observation value is NaN.""" + records = Records() + + # Create regression output with no. observations but NaN value + obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"]) + + output = [obs_output] + variables, total_records = records._extract_regression_info(output) + + # Should handle NaN gracefully + assert isinstance(variables, list) + assert total_records == 0 + + +def test_extract_regression_info_with_invalid_value(): + """Test _extract_regression_info when observation value is non-numeric.""" + records = Records() + + # Create regression output with invalid observation value + obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"]) + + output = [obs_output] + variables, total_records = records._extract_regression_info(output) + + # Should handle ValueError gracefully + assert isinstance(variables, list) + assert total_records == 0 From ea0a63ed6f33e7500526576673fbd2a6fdbb760d Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 15:38:21 +0000 Subject: [PATCH 02/29] test: add coverage tests for table info extraction --- test/test_initial.py | 64 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/test/test_initial.py b/test/test_initial.py index 591c8fde..da36993a 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1481,3 +1481,67 @@ def test_extract_regression_info_with_invalid_value(): # Should handle ValueError gracefully assert isinstance(variables, list) assert total_records == 0 + + +def test_extract_table_info_with_numeric_data(): + """Test _extract_table_info with numeric data to cover line 456 and 465.""" + records = Records() + + # Create a table with numeric values (not NaN) + table = pd.DataFrame( + [[10, 20], [30, 40]], + index=["row1", "row2"], + columns=["col1", "col2"], + ) + table.index.name = "idx" + table.columns.name = "cols" + + output = [table] + # Test with crosstab method to trigger the cell_sum > 0 path + variables, total_records = records._extract_table_info(output, "crosstab") + + # Should extract variables + assert "idx" in variables + assert "cols" in variables + # With numeric data, cell_sum will be 100 (10+20+30+40), so total_records should be 100 + assert total_records == 100 + + +def test_extract_table_info_with_mixed_data(): + """Test _extract_table_info with mixed NaN and numeric data.""" + records = Records() + + # Create a table with some NaN and some numeric values + table = pd.DataFrame( + [[10, np.nan], [np.nan, 40]], + index=["row1", "row2"], + columns=["col1", "col2"], + ) + table.index.name = "idx" + + output = [table] + variables, total_records = records._extract_table_info(output, "crosstab") + + # Should extract variables + assert "idx" in variables + # cell_sum = 10 + 40 = 50, so total_records should be 50 + assert total_records == 50 + + +def test_generate_summary_with_crosstab(data, acro): + """Test generate_summary triggers _extract_table_info coverage for lines 456-465.""" + # Create actual crosstab output to trigger generate_summary path + _ = acro.crosstab(data.year, data.grant_type) + + # Call generate_summary which will call _extract_table_info + summary_df = acro.results.generate_summary() + + # Verify summary was generated + assert len(summary_df) > 0 + assert "variables" in summary_df.columns + assert "total_records" in summary_df.columns + # Should have extracted year and grant_type as variables + assert ( + "year" in summary_df.iloc[0]["variables"] + or "grant_type" in summary_df.iloc[0]["variables"] + ) From f3c6d96970ec2da6f3dbccd7b20a47b5cf0da3ff Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 16:09:26 +0000 Subject: [PATCH 03/29] fix coverage --- test/test_initial.py | 85 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index da36993a..a7056e1e 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -555,10 +555,14 @@ def test_add_to_acro(data, monkeypatch): src_path = "test_add_to_acro" dest_path = "sdc_results" file_path = "crosstab.pkl" - if not os.path.exists(src_path): # pragma no cover - table.to_pickle(file_path) - os.mkdir(src_path) - shutil.move(file_path, src_path, copy_function=shutil.copytree) + # ensure a clean state before setup + if os.path.exists(src_path): + shutil.rmtree(src_path) + if os.path.exists(dest_path): + shutil.rmtree(dest_path) + table.to_pickle(file_path) + os.mkdir(src_path) + shutil.move(file_path, src_path, copy_function=shutil.copytree) # add exception to the output exception = ["I want it"] monkeypatch.setattr("builtins.input", lambda _: exception.pop(0)) @@ -566,6 +570,8 @@ def test_add_to_acro(data, monkeypatch): add_to_acro(src_path, dest_path) assert "results.json" in os.listdir(dest_path) assert "crosstab.pkl" in os.listdir(dest_path) + shutil.rmtree(src_path) + shutil.rmtree(dest_path) def test_prettify_tablestring(data): @@ -1545,3 +1551,74 @@ def test_generate_summary_with_crosstab(data, acro): "year" in summary_df.iloc[0]["variables"] or "grant_type" in summary_df.iloc[0]["variables"] ) + + +def test_extract_table_info_elif_index_name(): + """Cover lines 456-457: elif table.index.name branch. + + Triggered when index.names has no truthy entries (any() is False) + but index.name is set. + """ + records = Records() + + table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) + table.index.name = "myindex" + + # Subclass the index type to override `names` so any(names) is False, + # while `name` still returns "myindex" (reads from _names[0]). + idx_type = type(table.index) + patched_idx = type("PatchedIdx", (idx_type,), {"names": property(lambda _: [None])}) + table.index.__class__ = patched_idx + + output = [table] + variables, total_records = records._extract_table_info(output, "linear") + + assert "myindex" in variables + + +def test_extract_table_info_elif_columns_name(): + """Cover line 465: elif table.columns.name branch. + + Triggered when columns.names has no truthy entries (any() is False) + but columns.name is set. + """ + records = Records() + + table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) + table.columns.name = "mycols" + + # Override index.names to avoid the `if` branch for index (no index name set). + # Override columns.names so any() is False, forcing the elif for columns. + col_type = type(table.columns) + patched_idx = type("PatchedIdx", (col_type,), {"names": property(lambda _: [None])}) + table.columns.__class__ = patched_idx + + output = [table] + variables, total_records = records._extract_table_info(output, "linear") + + assert "mycols" in variables + + +def test_extract_table_info_shape_type_error(): + """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. + + Triggered when shape multiplication raises TypeError (e.g. shape returns + (None, None) so None * None raises TypeError). + """ + records = Records() + + table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) + table.index.name = "idx" + + class BrokenShapeDF(pd.DataFrame): + @property + def shape(self): + return (None, None) + + table.__class__ = BrokenShapeDF + + output = [table] + variables, total_records = records._extract_table_info(output, "linear") + + assert isinstance(variables, list) + assert total_records == 0 From f9ef74ec853564f062ce69cee8c78c22dd2ca912 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 16:21:32 +0000 Subject: [PATCH 04/29] test: add coverage for empty summary edge case (line 595) --- test/test_initial.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_initial.py b/test/test_initial.py index a7056e1e..f04a07fc 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1622,3 +1622,17 @@ def shape(self): assert isinstance(variables, list) assert total_records == 0 + + +def test_write_summary_empty_session(tmp_path): + """Test write_summary when there are no outputs (line 595 coverage).""" + records = Records() + + # Create a temp directory for output + output_path = str(tmp_path / "empty_summary") + + # Call write_summary with no outputs - should return early without creating file + records.write_summary(output_path) + + # Verify no file was created since summary was empty + assert not os.path.exists(os.path.normpath(f"{output_path}/summary.csv")) From 47359b2fde398fb96a03d6ed28feb844327150af Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 16:24:22 +0000 Subject: [PATCH 05/29] test: remove unused variable idx_type --- test/test_initial.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index f04a07fc..98ce074f 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1566,8 +1566,11 @@ def test_extract_table_info_elif_index_name(): # Subclass the index type to override `names` so any(names) is False, # while `name` still returns "myindex" (reads from _names[0]). - idx_type = type(table.index) - patched_idx = type("PatchedIdx", (idx_type,), {"names": property(lambda _: [None])}) + patched_idx = type( + "PatchedIdx", + (type(table.index),), + {"names": property(lambda _: [None])}, + ) table.index.__class__ = patched_idx output = [table] From f817df23c882bc09e606d9be8df6fb96258ffe94 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 19 Feb 2026 16:28:59 +0000 Subject: [PATCH 06/29] test: fix unused variable warnings by using underscore --- test/test_initial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index 98ce074f..cb27c229 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1574,7 +1574,7 @@ def test_extract_table_info_elif_index_name(): table.index.__class__ = patched_idx output = [table] - variables, total_records = records._extract_table_info(output, "linear") + variables, _ = records._extract_table_info(output, "linear") assert "myindex" in variables @@ -1597,7 +1597,7 @@ def test_extract_table_info_elif_columns_name(): table.columns.__class__ = patched_idx output = [table] - variables, total_records = records._extract_table_info(output, "linear") + variables, _ = records._extract_table_info(output, "linear") assert "mycols" in variables From e8bc9267c4374452a66f8a65c3ece04e05a744d0 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 5 Mar 2026 09:38:56 +0000 Subject: [PATCH 07/29] feat: implement session summary with differencing risk detection - Add generate_summary() to provide high-level output overview for checkers - Add multi-layered 'DO NOT RELEASE' warnings (filename, comment) - Integrate session summary into JSON and Excel outputs Resolves #224 --- acro/record.py | 49 +++++++++++++++++++++++++++++--------------- test/test_initial.py | 24 +++++++++++----------- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/acro/record.py b/acro/record.py index 94e00ae4..dea7238a 100644 --- a/acro/record.py +++ b/acro/record.py @@ -450,13 +450,11 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int for table in output: if isinstance(table, DataFrame): - # Extract index names if hasattr(table.index, "names") and any(table.index.names): variables.extend(str(n) for n in table.index.names if n is not None) elif table.index.name: variables.append(str(table.index.name)) - # Extract column names if hasattr(table.columns, "names") and any(table.columns.names): variables.extend( str(n) for n in table.columns.names if n is not None @@ -464,7 +462,6 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int elif table.columns.name: variables.append(str(table.columns.name)) - # Calculate total records try: total_records += int(table.shape[0] * table.shape[1]) if method == "crosstab": @@ -549,6 +546,8 @@ def generate_summary(self) -> DataFrame: """ rows = [] for uid, rec in self.results.items(): + if rec.output_type == "custom": + continue method = rec.properties.get("method", rec.output_type) variables: list[str] = [] total_records: int = 0 @@ -582,21 +581,28 @@ def generate_summary(self) -> DataFrame: return summary_df - def write_summary(self, path: str) -> None: - """Write the session summary to a CSV file. + def add_summary_to_results(self) -> None: + """Add the summary DataFrame as a custom output to results. - Parameters - ---------- - path : str - Name of the output folder. + This generates a summary of all outputs in the session with metadata + about variables, record counts, and differencing risk. The file is + marked with a clear warning not to release. """ summary_df = self.generate_summary() if summary_df.empty: return - os.makedirs(path, exist_ok=True) - filename = os.path.normpath(f"{path}/summary.csv") - summary_df.to_csv(filename, index=False) - logger.info("summary written to: %s", filename) + + os.makedirs("acro_artifacts", exist_ok=True) + # Use explicit filename to indicate this should not be released + summary_path = os.path.normpath( + "acro_artifacts/DO_NOT_RELEASE_session_summary.csv" + ) + summary_df.to_csv(summary_path, index=False) + + self.add_custom( + summary_path, + "⚠️ WARNING: DO NOT RELEASE - Session summary for output checker use only", + ) def finalise(self, path: str, ext: str, interactive: bool = False) -> None: """Create a results file for checking. @@ -613,13 +619,13 @@ def finalise(self, path: str, ext: str, interactive: bool = False) -> None: logger.debug("finalise()") if interactive: self.validate_outputs() + self.add_summary_to_results() if ext == "json": self.finalise_json(path) elif ext == "xlsx": self.finalise_excel(path) else: raise ValueError("Invalid file extension. Options: {json, xlsx}") - self.write_summary(path) self.write_checksums(path) # check if the directory acro_artifacts exists and delete it if os.path.exists("acro_artifacts"): @@ -653,7 +659,19 @@ def finalise_json(self, path: str) -> None: for file in files: outputs[key]["files"].append({"name": file, "sdc": val.sdc}) - results: dict = {"version": __version__, "results": outputs} + # Generate and include session summary for output checkers + summary_df = self.generate_summary() + session_summary = { + "DO_NOT_RELEASE": True, + "purpose": "Session summary for output checker use only", + "data": json.loads(summary_df.to_json(orient="records")), + } + + results: dict = { + "version": __version__, + "results": outputs, + "session_summary": session_summary, + } filename: str = os.path.normpath(f"{path}/results.json") try: with open(filename, "w", newline="", encoding="utf-8") as handle: @@ -703,7 +721,6 @@ def finalise_excel(self, path: str) -> None: {"Sheet": sheet, "Command": command, "Summary": summary} ) tmp_df.to_excel(writer, sheet_name="description", index=False, startrow=0) - # individual sheets for output_id, output in self.results.items(): if output.output_type == "custom": diff --git a/test/test_initial.py b/test/test_initial.py index cb27c229..c3e03ff5 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1259,12 +1259,14 @@ def test_toggle_suppression(): def test_summary_csv_created_with_json(data, acro): - """Test that summary.csv is created when finalising to JSON (Issue #224).""" + """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON (Issue #224).""" _ = acro.crosstab(data.year, data.grant_type) acro.add_exception("output_0", "Let me have it") acro.finalise(PATH, "json") - summary_path = os.path.normpath(f"{PATH}/summary.csv") - assert os.path.exists(summary_path), "summary.csv was not created" + summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv") + assert os.path.exists(summary_path), ( + "DO_NOT_RELEASE_session_summary.csv was not created" + ) summary_df = pd.read_csv(summary_path) assert len(summary_df) == 1 assert "id" in summary_df.columns @@ -1306,7 +1308,7 @@ def test_summary_variables_extracted(data): _ = acro_obj.crosstab(data.year, data.grant_type) acro_obj.add_exception("output_0", "Let me have it") acro_obj.finalise(PATH, "json") - summary_path = os.path.normpath(f"{PATH}/summary.csv") + summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv") summary_df = pd.read_csv(summary_path) variables = summary_df.iloc[0]["variables"] assert "year" in variables @@ -1627,15 +1629,13 @@ def shape(self): assert total_records == 0 -def test_write_summary_empty_session(tmp_path): - """Test write_summary when there are no outputs (line 595 coverage).""" +def test_write_summary_empty_session(): + """Test add_summary_to_results when there are no outputs (line 595 coverage).""" records = Records() - # Create a temp directory for output - output_path = str(tmp_path / "empty_summary") - - # Call write_summary with no outputs - should return early without creating file - records.write_summary(output_path) + records.add_summary_to_results() # Verify no file was created since summary was empty - assert not os.path.exists(os.path.normpath(f"{output_path}/summary.csv")) + assert not os.path.exists( + os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv") + ) From e3665ffe1afc445e22428cf7e7de995c68d29976 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 5 Mar 2026 11:03:28 +0000 Subject: [PATCH 08/29] feat(record): update warning message --- acro/record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acro/record.py b/acro/record.py index dea7238a..5f17bfe9 100644 --- a/acro/record.py +++ b/acro/record.py @@ -601,7 +601,7 @@ def add_summary_to_results(self) -> None: self.add_custom( summary_path, - "⚠️ WARNING: DO NOT RELEASE - Session summary for output checker use only", + "WARNING: DO NOT RELEASE - Session summary for output checker use only", ) def finalise(self, path: str, ext: str, interactive: bool = False) -> None: From 404bba3e7b3af20c44834b3f6cfad58208adf5fe Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 5 Mar 2026 13:43:53 +0000 Subject: [PATCH 09/29] Add generate_summary() to provide high-level output overview for checkers - Add multi-layered 'DO NOT RELEASE' warnings (filename, comment) - Integrate session summary into JSON and Excel outputs --- acro/record.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/acro/record.py b/acro/record.py index b5d1ac6b..62cacf9d 100644 --- a/acro/record.py +++ b/acro/record.py @@ -463,16 +463,39 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int variables.append(str(table.columns.name)) try: - total_records += int(table.shape[0] * table.shape[1]) if method == "crosstab": - cell_sum = table.values[~pd.isna(table.values)].sum() - if cell_sum > 0: - total_records = int(cell_sum) + total_records = self._get_crosstab_record_count(table) + else: + total_records += int(table.shape[0] * table.shape[1]) except (TypeError, ValueError): pass return variables, total_records + def _get_crosstab_record_count(self, table: DataFrame) -> int: + """Get record count from crosstab, excluding margins. + + Parameters + ---------- + table : DataFrame + The crosstab table. + + Returns + ------- + int + The record count. + """ + working_table = table.copy() + if "All" in working_table.index: + working_table = working_table.drop("All", axis=0) + if "All" in working_table.columns: + working_table = working_table.drop("All", axis=1) + + cell_sum = working_table.values[~pd.isna(working_table.values)].sum() + if cell_sum > 0: + return int(cell_sum) + return int(table.shape[0] * table.shape[1]) + def _extract_regression_info(self, output: list) -> tuple[list[str], int]: """Extract variables and total records from regression output. From fcbe9fae65a990475f1f769e12fc69c670addd68 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 5 Mar 2026 13:51:46 +0000 Subject: [PATCH 10/29] - Add multi-layered 'DO NOT RELEASE' warnings (filename, comment) - Integrate session summary into JSON and Excel outputs --- test/test_initial.py | 123 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/test/test_initial.py b/test/test_initial.py index 49b63d4a..e572e52d 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1702,3 +1702,126 @@ def test_write_summary_empty_session(): assert not os.path.exists( os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv") ) + + +def test_get_crosstab_record_count_with_margins(): + """Test _get_crosstab_record_count correctly excludes margin rows/columns. + + This tests lines 488-492 of record.py to ensure that "All" row and column + are properly excluded when calculating record counts from crosstabs. + """ + records = Records() + + # Create a table with margins (like what pandas crosstab produces) + table = pd.DataFrame( + [[5, 5, 10], [5, 5, 10], [10, 10, 20]], + index=pd.Index([2010, 2011, "All"], name="year"), + columns=pd.Index(["A", "B", "All"], name="grant_type") + ) + + # Call _get_crosstab_record_count + count = records._get_crosstab_record_count(table) + + # Should sum only the non-margin cells: 5+5+5+5 = 20 + # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col + assert count == 20 + + +def test_get_crosstab_record_count_without_margins(): + """Test _get_crosstab_record_count with a table that has no margins. + + When there is no "All" row or column, should sum all cells normally. + """ + records = Records() + + # Create a table without margins + table = pd.DataFrame( + [[5, 5], [5, 5]], + index=pd.Index([2010, 2011], name="year"), + columns=pd.Index(["A", "B"], name="grant_type") + ) + + count = records._get_crosstab_record_count(table) + + # Should sum all cells: 5+5+5+5 = 20 + assert count == 20 + + +def test_get_crosstab_record_count_with_margin_row_only(): + """Test _get_crosstab_record_count when only margin row exists. + + This covers the case where "All" is in the index but not in columns. + """ + records = Records() + + # Create a table with margin row but not margin column + table = pd.DataFrame( + [[5, 5], [5, 5], [10, 10]], + index=pd.Index([2010, 2011, "All"], name="year"), + columns=pd.Index(["A", "B"], name="grant_type") + ) + + count = records._get_crosstab_record_count(table) + + # Should exclude "All" row and sum: 5+5+5+5 = 20 + assert count == 20 + + +def test_get_crosstab_record_count_with_margin_column_only(): + """Test _get_crosstab_record_count when only margin column exists. + + This covers the case where "All" is in columns but not in the index. + """ + records = Records() + + # Create a table with margin column but not margin row + table = pd.DataFrame( + [[5, 5, 10], [5, 5, 10]], + index=pd.Index([2010, 2011], name="year"), + columns=pd.Index(["A", "B", "All"], name="grant_type") + ) + + count = records._get_crosstab_record_count(table) + + # Should exclude "All" column and sum: 5+5+5+5 = 20 + assert count == 20 + + +def test_get_crosstab_record_count_with_nan_values(): + """Test _get_crosstab_record_count correctly handles NaN values. + + This tests the isna() check when summing cell values (line 493). + """ + records = Records() + + # Create a table with some NaN values (suppressed cells) + table = pd.DataFrame( + [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]], + index=pd.Index([2010, 2011, "All"], name="year"), + columns=pd.Index(["A", "B", "C"], name="grant_type") + ) + + count = records._get_crosstab_record_count(table) + + # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20 + assert count == 20 + + +def test_get_crosstab_record_count_fallback(): + """Test _get_crosstab_record_count fallback when cell_sum is 0. + + This tests lines 494-495 where fallback calculation is used. + """ + records = Records() + + # Create a table with all zeros or NaN (simulating all suppressed) + table = pd.DataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], + index=pd.Index([2010, 2011], name="year"), + columns=pd.Index(["A", "B"], name="grant_type") + ) + + count = records._get_crosstab_record_count(table) + + # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4 + assert count == 4 From 4c16dcd7cb4521d59142ba441aa7552014302fe7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:52:12 +0000 Subject: [PATCH 11/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_initial.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index e572e52d..14f49d9b 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1716,7 +1716,7 @@ def test_get_crosstab_record_count_with_margins(): table = pd.DataFrame( [[5, 5, 10], [5, 5, 10], [10, 10, 20]], index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type") + columns=pd.Index(["A", "B", "All"], name="grant_type"), ) # Call _get_crosstab_record_count @@ -1738,7 +1738,7 @@ def test_get_crosstab_record_count_without_margins(): table = pd.DataFrame( [[5, 5], [5, 5]], index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type") + columns=pd.Index(["A", "B"], name="grant_type"), ) count = records._get_crosstab_record_count(table) @@ -1758,7 +1758,7 @@ def test_get_crosstab_record_count_with_margin_row_only(): table = pd.DataFrame( [[5, 5], [5, 5], [10, 10]], index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B"], name="grant_type") + columns=pd.Index(["A", "B"], name="grant_type"), ) count = records._get_crosstab_record_count(table) @@ -1778,7 +1778,7 @@ def test_get_crosstab_record_count_with_margin_column_only(): table = pd.DataFrame( [[5, 5, 10], [5, 5, 10]], index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type") + columns=pd.Index(["A", "B", "All"], name="grant_type"), ) count = records._get_crosstab_record_count(table) @@ -1798,7 +1798,7 @@ def test_get_crosstab_record_count_with_nan_values(): table = pd.DataFrame( [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]], index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "C"], name="grant_type") + columns=pd.Index(["A", "B", "C"], name="grant_type"), ) count = records._get_crosstab_record_count(table) @@ -1818,7 +1818,7 @@ def test_get_crosstab_record_count_fallback(): table = pd.DataFrame( [[np.nan, np.nan], [np.nan, np.nan]], index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type") + columns=pd.Index(["A", "B"], name="grant_type"), ) count = records._get_crosstab_record_count(table) From e1271ae43a93d3016ef7cc20465e34ca50324016 Mon Sep 17 00:00:00 2001 From: Jessica Ikechukwu Date: Fri, 6 Mar 2026 06:00:13 +0000 Subject: [PATCH 12/29] Remove tests for index and columns name extraction Remove tests for extracting table info based on index and columns names. Signed-off-by: Jessica Ikechukwu --- test/test_initial.py | 48 -------------------------------------------- 1 file changed, 48 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index 14f49d9b..e1c9f8ee 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1618,54 +1618,6 @@ def test_generate_summary_with_crosstab(data, acro): ) -def test_extract_table_info_elif_index_name(): - """Cover lines 456-457: elif table.index.name branch. - - Triggered when index.names has no truthy entries (any() is False) - but index.name is set. - """ - records = Records() - - table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) - table.index.name = "myindex" - - # Subclass the index type to override `names` so any(names) is False, - # while `name` still returns "myindex" (reads from _names[0]). - patched_idx = type( - "PatchedIdx", - (type(table.index),), - {"names": property(lambda _: [None])}, - ) - table.index.__class__ = patched_idx - - output = [table] - variables, _ = records._extract_table_info(output, "linear") - - assert "myindex" in variables - - -def test_extract_table_info_elif_columns_name(): - """Cover line 465: elif table.columns.name branch. - - Triggered when columns.names has no truthy entries (any() is False) - but columns.name is set. - """ - records = Records() - - table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) - table.columns.name = "mycols" - - # Override index.names to avoid the `if` branch for index (no index name set). - # Override columns.names so any() is False, forcing the elif for columns. - col_type = type(table.columns) - patched_idx = type("PatchedIdx", (col_type,), {"names": property(lambda _: [None])}) - table.columns.__class__ = patched_idx - - output = [table] - variables, _ = records._extract_table_info(output, "linear") - - assert "mycols" in variables - def test_extract_table_info_shape_type_error(): """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. From 6a96fd21ab3cdf32ca5ae27a170c2e39a4d46b8a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 06:00:27 +0000 Subject: [PATCH 13/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_initial.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_initial.py b/test/test_initial.py index e1c9f8ee..a56fb7d0 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1618,7 +1618,6 @@ def test_generate_summary_with_crosstab(data, acro): ) - def test_extract_table_info_shape_type_error(): """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. From c2cd7b6b3c0721ea79d322b510caf7713eceeba1 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Fri, 6 Mar 2026 10:09:57 +0000 Subject: [PATCH 14/29] refactor: remove unreachable elif branches and unnecessary tests - All core functionality preserved and tested with real DataFrame scenarios --- acro/record.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/acro/record.py b/acro/record.py index 62cacf9d..4e896cfd 100644 --- a/acro/record.py +++ b/acro/record.py @@ -452,15 +452,11 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int if isinstance(table, DataFrame): if hasattr(table.index, "names") and any(table.index.names): variables.extend(str(n) for n in table.index.names if n is not None) - elif table.index.name: - variables.append(str(table.index.name)) if hasattr(table.columns, "names") and any(table.columns.names): variables.extend( str(n) for n in table.columns.names if n is not None ) - elif table.columns.name: - variables.append(str(table.columns.name)) try: if method == "crosstab": From ee4652884225269283f1435440f2d519da5380b3 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 10:33:07 +0000 Subject: [PATCH 15/29] Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- acro/record.py | 159 ++++++++++++++++++++++------ test/test_initial.py | 247 ++++++++++++++----------------------------- 2 files changed, 204 insertions(+), 202 deletions(-) diff --git a/acro/record.py b/acro/record.py index 4e896cfd..62eceb8c 100644 --- a/acro/record.py +++ b/acro/record.py @@ -430,7 +430,12 @@ def validate_outputs(self) -> None: ) record.exception = input("") - def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]: + def _extract_table_info( + self, + output: list, + method: str, # noqa: ARG002 + properties: dict, # noqa: ARG002 + ) -> tuple[list[str], int]: """Extract variables and total records from table output. Parameters @@ -439,6 +444,8 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int The output to extract information from. method : str The method used to generate the output. + properties : dict + Properties dictionary (unused but kept for compatibility). Returns ------- @@ -459,39 +466,17 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int ) try: - if method == "crosstab": - total_records = self._get_crosstab_record_count(table) + # Count non-NaN cells for record count + cell_sum = table.values[~pd.isna(table.values)].sum() + if cell_sum > 0: + total_records = int(cell_sum) else: - total_records += int(table.shape[0] * table.shape[1]) + total_records = int(table.shape[0] * table.shape[1]) except (TypeError, ValueError): pass return variables, total_records - def _get_crosstab_record_count(self, table: DataFrame) -> int: - """Get record count from crosstab, excluding margins. - - Parameters - ---------- - table : DataFrame - The crosstab table. - - Returns - ------- - int - The record count. - """ - working_table = table.copy() - if "All" in working_table.index: - working_table = working_table.drop("All", axis=0) - if "All" in working_table.columns: - working_table = working_table.drop("All", axis=1) - - cell_sum = working_table.values[~pd.isna(working_table.values)].sum() - if cell_sum > 0: - return int(cell_sum) - return int(table.shape[0] * table.shape[1]) - def _extract_regression_info(self, output: list) -> tuple[list[str], int]: """Extract variables and total records from regression output. @@ -544,12 +529,113 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame: if not table_outputs.empty: for _, group in table_outputs.groupby("variables"): if len(group) > 1: - counts = group["total_records"].unique() - if len(counts) > 1: + # Check for different suppression settings + suppressions = group["suppression"].unique() + # Risk if same variables with different suppression settings + if len(suppressions) > 1: summary_df.loc[group.index, "diff_risk"] = True return summary_df + def _extract_all_variables(self) -> list[str]: + """Extract all unique variables across all outputs. + + Returns + ------- + list[str] + Sorted list of unique variable names. + """ + all_variables: set[str] = set() + + for rec in self.results.values(): + if rec.output_type == "custom": + continue + variables = self._get_output_variables(rec) + all_variables.update(variables) + + return sorted(all_variables) + + def _get_output_variables(self, rec) -> list[str]: + """Extract variables from a single record. + + Parameters + ---------- + rec : Record + The record to extract variables from. + + Returns + ------- + list[str] + List of variable names. + """ + method = rec.properties.get("method", rec.output_type) + variables: list[str] = [] + + if rec.output_type == "table": + variables, _ = self._extract_table_info(rec.output, method, rec.properties) + elif rec.output_type == "regression": + variables, _ = self._extract_regression_info(rec.output) + dof = rec.properties.get("dof") + if dof is not None: + variables.append(f"dof={dof}") + + return variables + + + def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame: + """Build a variable-output matrix showing variable usage. + + Parameters + ---------- + summary_df : DataFrame + The base summary DataFrame. + + Returns + ------- + DataFrame + Summary with binary variable columns added. + """ + all_variables = self._extract_all_variables() + + if not all_variables: + return summary_df + + # Create binary columns for each variable + for var in all_variables: + summary_df[var] = summary_df["variables"].apply( + lambda vars_str: 1 if var in vars_str.split("; ") else 0 + ) + + return summary_df + + def generate_variable_matrix_table(self) -> DataFrame: + """Generate a clean variable-output matrix table for PHS requirements. + + Creates a table with one row per output and one column per variable, + plus an output_type column. Binary values indicate variable usage. + + Returns + ------- + DataFrame + Variable matrix table with columns: output_type, var1, var2, ... + """ + all_variables = self._extract_all_variables() + matrix_rows = [] + + for uid, rec in self.results.items(): + if rec.output_type == "custom": + continue + + variables = self._get_output_variables(rec) + + row = {"output_id": uid, "output_type": rec.output_type} + for var in all_variables: + row[var] = 1 if var in variables else 0 + + matrix_rows.append(row) + + return DataFrame(matrix_rows) + def generate_summary(self) -> DataFrame: """Generate a summary DataFrame of all outputs in the session. @@ -561,7 +647,8 @@ def generate_summary(self) -> DataFrame: ------- DataFrame Summary of all outputs with columns: id, method, status, type, - command, summary, variables, total_records, timestamp, diff_risk. + command, summary, variables, total_records, suppression, + timestamp, diff_risk. """ rows = [] for uid, rec in self.results.items(): @@ -572,7 +659,9 @@ def generate_summary(self) -> DataFrame: total_records: int = 0 if rec.output_type == "table": - variables, total_records = self._extract_table_info(rec.output, method) + variables, total_records = self._extract_table_info( + rec.output, method, rec.properties + ) elif rec.output_type == "regression": variables, total_records = self._extract_regression_info(rec.output) dof = rec.properties.get("dof") @@ -581,6 +670,10 @@ def generate_summary(self) -> DataFrame: variables_str = "; ".join(variables) if variables else "" + suppression = False + if isinstance(rec.sdc, dict) and "summary" in rec.sdc: + suppression = rec.sdc["summary"].get("suppressed", False) + rows.append( { "id": uid, @@ -591,12 +684,14 @@ def generate_summary(self) -> DataFrame: "summary": rec.summary, "variables": variables_str, "total_records": total_records, + "suppression": suppression, "timestamp": rec.timestamp, } ) summary_df = DataFrame(rows) summary_df = self._mark_diff_risk(summary_df) + summary_df = self._build_variable_matrix(summary_df) return summary_df diff --git a/test/test_initial.py b/test/test_initial.py index a56fb7d0..54c0917b 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1336,6 +1336,7 @@ def test_summary_csv_created_with_json(data, acro): assert "method" in summary_df.columns assert "variables" in summary_df.columns assert "total_records" in summary_df.columns + assert "suppression" in summary_df.columns assert "diff_risk" in summary_df.columns # check values assert summary_df.iloc[0]["id"] == "output_0" @@ -1380,20 +1381,24 @@ def test_summary_variables_extracted(data): def test_summary_differencing_risk(data): - """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224).""" + """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224).""" acro_obj = ACRO(suppress=True) _ = acro_obj.crosstab(data.year, data.grant_type) - data_subset = data.iloc[10:].copy() - _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type) + acro_obj.suppress = False + _ = acro_obj.crosstab(data.year, data.grant_type) acro_obj.add_exception("output_0", "Let me have it") acro_obj.add_exception("output_1", "I need this") summary_df = acro_obj.results.generate_summary() + # Both outputs should have same variables assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"] - assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"] + # But different suppression settings + assert summary_df.iloc[0]["suppression"] == True + assert summary_df.iloc[1]["suppression"] == False + # Both should be flagged as diff_risk assert summary_df.iloc[0]["diff_risk"] == True assert summary_df.iloc[1]["diff_risk"] == True @@ -1428,21 +1433,14 @@ def test_extract_table_info(data, acro): acro.crosstab(data.year, data.grant_type) output = acro.results.get_index(0) - # Test extracting info from table output with crosstab method + # Test extracting info from table output variables, total_records = acro.results._extract_table_info( - output.output, "crosstab" + output.output, "crosstab", output.properties ) assert len(variables) > 0 assert total_records > 0 assert "year" in variables or "grant_type" in variables - # Also test with non-crosstab method to ensure both paths work - variables2, total_records2 = acro.results._extract_table_info( - output.output, "pivot_table" - ) - assert len(variables2) > 0 - assert total_records2 > 0 - def test_extract_regression_info(): """Test the _extract_regression_info helper method with sample data.""" @@ -1499,7 +1497,9 @@ def test_extract_table_info_with_zero_cell_sum(): table.columns.name = "cols" output = [table] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables and use shape-based count when cell_sum is 0 assert "idx" in variables @@ -1517,7 +1517,9 @@ def __init__(self): self.shape = "invalid" # Non-integer shape to trigger TypeError output = [InvalidTable()] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should handle exception gracefully assert isinstance(variables, list) @@ -1569,7 +1571,9 @@ def test_extract_table_info_with_numeric_data(): output = [table] # Test with crosstab method to trigger the cell_sum > 0 path - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables assert "idx" in variables @@ -1591,7 +1595,9 @@ def test_extract_table_info_with_mixed_data(): table.index.name = "idx" output = [table] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables assert "idx" in variables @@ -1599,30 +1605,14 @@ def test_extract_table_info_with_mixed_data(): assert total_records == 50 -def test_generate_summary_with_crosstab(data, acro): - """Test generate_summary triggers _extract_table_info coverage for lines 456-465.""" - # Create actual crosstab output to trigger generate_summary path - _ = acro.crosstab(data.year, data.grant_type) - - # Call generate_summary which will call _extract_table_info - summary_df = acro.results.generate_summary() - - # Verify summary was generated - assert len(summary_df) > 0 - assert "variables" in summary_df.columns - assert "total_records" in summary_df.columns - # Should have extracted year and grant_type as variables - assert ( - "year" in summary_df.iloc[0]["variables"] - or "grant_type" in summary_df.iloc[0]["variables"] - ) - - def test_extract_table_info_shape_type_error(): """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. Triggered when shape multiplication raises TypeError (e.g. shape returns (None, None) so None * None raises TypeError). + + Note: The current implementation successfully sums cell values even when + shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0. """ records = Records() @@ -1637,142 +1627,59 @@ def shape(self): table.__class__ = BrokenShapeDF output = [table] - variables, total_records = records._extract_table_info(output, "linear") - - assert isinstance(variables, list) - assert total_records == 0 - - -def test_write_summary_empty_session(): - """Test add_summary_to_results when there are no outputs (line 595 coverage).""" - records = Records() - - records.add_summary_to_results() - - # Verify no file was created since summary was empty - assert not os.path.exists( - os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv") - ) - - -def test_get_crosstab_record_count_with_margins(): - """Test _get_crosstab_record_count correctly excludes margin rows/columns. - - This tests lines 488-492 of record.py to ensure that "All" row and column - are properly excluded when calculating record counts from crosstabs. - """ - records = Records() - - # Create a table with margins (like what pandas crosstab produces) - table = pd.DataFrame( - [[5, 5, 10], [5, 5, 10], [10, 10, 20]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type"), - ) - - # Call _get_crosstab_record_count - count = records._get_crosstab_record_count(table) - - # Should sum only the non-margin cells: 5+5+5+5 = 20 - # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col - assert count == 20 - - -def test_get_crosstab_record_count_without_margins(): - """Test _get_crosstab_record_count with a table that has no margins. - - When there is no "All" row or column, should sum all cells normally. - """ - records = Records() - - # Create a table without margins - table = pd.DataFrame( - [[5, 5], [5, 5]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should sum all cells: 5+5+5+5 = 20 - assert count == 20 - - -def test_get_crosstab_record_count_with_margin_row_only(): - """Test _get_crosstab_record_count when only margin row exists. - - This covers the case where "All" is in the index but not in columns. - """ - records = Records() - - # Create a table with margin row but not margin column - table = pd.DataFrame( - [[5, 5], [5, 5], [10, 10]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should exclude "All" row and sum: 5+5+5+5 = 20 - assert count == 20 - - -def test_get_crosstab_record_count_with_margin_column_only(): - """Test _get_crosstab_record_count when only margin column exists. - - This covers the case where "All" is in columns but not in the index. - """ - records = Records() - - # Create a table with margin column but not margin row - table = pd.DataFrame( - [[5, 5, 10], [5, 5, 10]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type"), + variables, total_records = records._extract_table_info( + output, "linear", {"margins": False} ) - count = records._get_crosstab_record_count(table) - - # Should exclude "All" column and sum: 5+5+5+5 = 20 - assert count == 20 - - -def test_get_crosstab_record_count_with_nan_values(): - """Test _get_crosstab_record_count correctly handles NaN values. - - This tests the isna() check when summing cell values (line 493). - """ - records = Records() - - # Create a table with some NaN values (suppressed cells) - table = pd.DataFrame( - [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "C"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20 - assert count == 20 - + assert isinstance(variables, list) + assert total_records == 10 # Sum of cell values: 1+2+3+4 -def test_get_crosstab_record_count_fallback(): - """Test _get_crosstab_record_count fallback when cell_sum is 0. - This tests lines 494-495 where fallback calculation is used. +def test_generate_variable_matrix_table(): + """Test the PHS-format variable matrix table. + + Should have one row per output and one column per variable, + with binary values indicating variable presence. """ - records = Records() - - # Create a table with all zeros or NaN (simulating all suppressed) - table = pd.DataFrame( - [[np.nan, np.nan], [np.nan, np.nan]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4 - assert count == 4 + acro_obj = ACRO(suppress=False) + + data = pd.DataFrame({ + 'year': [2010]*10 + [2011]*10, + 'grant_type': ['A']*5 + ['B']*5 + ['A']*5 + ['B']*5, + 'region': ['North']*5 + ['South']*5 + ['North']*5 + ['South']*5, + }) + + # Create three outputs with different variable combinations + _ = acro_obj.crosstab(data.year, data.grant_type) + _ = acro_obj.crosstab(data.year, data.region) + _ = acro_obj.crosstab(data.grant_type, data.region) + + var_matrix = acro_obj.results.generate_variable_matrix_table() + + + assert 'output_id' in var_matrix.columns + assert 'output_type' in var_matrix.columns + assert len(var_matrix) == 3 + + assert 'year' in var_matrix.columns + assert 'grant_type' in var_matrix.columns + assert 'region' in var_matrix.columns + + + for row in var_matrix.itertuples(): + assert row.year in [0, 1] + assert row.grant_type in [0, 1] + assert row.region in [0, 1] + + + assert var_matrix.iloc[0]['year'] == 1 + assert var_matrix.iloc[0]['grant_type'] == 1 + assert var_matrix.iloc[0]['region'] == 0 + + assert var_matrix.iloc[1]['year'] == 1 + assert var_matrix.iloc[1]['grant_type'] == 0 + assert var_matrix.iloc[1]['region'] == 1 + + assert var_matrix.iloc[2]['year'] == 0 + assert var_matrix.iloc[2]['grant_type'] == 1 + assert var_matrix.iloc[2]['region'] == 1 From 0202e392df46ac3d353347e13765d5cc8a3faf68 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Mar 2026 10:33:52 +0000 Subject: [PATCH 16/29] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- acro/record.py | 5 ++-- test/test_initial.py | 69 ++++++++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/acro/record.py b/acro/record.py index 62eceb8c..011b7dc3 100644 --- a/acro/record.py +++ b/acro/record.py @@ -557,12 +557,12 @@ def _extract_all_variables(self) -> list[str]: def _get_output_variables(self, rec) -> list[str]: """Extract variables from a single record. - + Parameters ---------- rec : Record The record to extract variables from. - + Returns ------- list[str] @@ -581,7 +581,6 @@ def _get_output_variables(self, rec) -> list[str]: return variables - def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame: """Build a variable-output matrix showing variable usage. diff --git a/test/test_initial.py b/test/test_initial.py index 54c0917b..c71c387e 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1610,7 +1610,7 @@ def test_extract_table_info_shape_type_error(): Triggered when shape multiplication raises TypeError (e.g. shape returns (None, None) so None * None raises TypeError). - + Note: The current implementation successfully sums cell values even when shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0. """ @@ -1637,49 +1637,48 @@ def shape(self): def test_generate_variable_matrix_table(): """Test the PHS-format variable matrix table. - + Should have one row per output and one column per variable, with binary values indicating variable presence. """ acro_obj = ACRO(suppress=False) - - data = pd.DataFrame({ - 'year': [2010]*10 + [2011]*10, - 'grant_type': ['A']*5 + ['B']*5 + ['A']*5 + ['B']*5, - 'region': ['North']*5 + ['South']*5 + ['North']*5 + ['South']*5, - }) - + + data = pd.DataFrame( + { + "year": [2010] * 10 + [2011] * 10, + "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5, + "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5, + } + ) + # Create three outputs with different variable combinations - _ = acro_obj.crosstab(data.year, data.grant_type) - _ = acro_obj.crosstab(data.year, data.region) - _ = acro_obj.crosstab(data.grant_type, data.region) - + _ = acro_obj.crosstab(data.year, data.grant_type) + _ = acro_obj.crosstab(data.year, data.region) + _ = acro_obj.crosstab(data.grant_type, data.region) + var_matrix = acro_obj.results.generate_variable_matrix_table() - - assert 'output_id' in var_matrix.columns - assert 'output_type' in var_matrix.columns - assert len(var_matrix) == 3 - - assert 'year' in var_matrix.columns - assert 'grant_type' in var_matrix.columns - assert 'region' in var_matrix.columns - + assert "output_id" in var_matrix.columns + assert "output_type" in var_matrix.columns + assert len(var_matrix) == 3 + + assert "year" in var_matrix.columns + assert "grant_type" in var_matrix.columns + assert "region" in var_matrix.columns for row in var_matrix.itertuples(): assert row.year in [0, 1] assert row.grant_type in [0, 1] assert row.region in [0, 1] - - - assert var_matrix.iloc[0]['year'] == 1 - assert var_matrix.iloc[0]['grant_type'] == 1 - assert var_matrix.iloc[0]['region'] == 0 - - assert var_matrix.iloc[1]['year'] == 1 - assert var_matrix.iloc[1]['grant_type'] == 0 - assert var_matrix.iloc[1]['region'] == 1 - - assert var_matrix.iloc[2]['year'] == 0 - assert var_matrix.iloc[2]['grant_type'] == 1 - assert var_matrix.iloc[2]['region'] == 1 + + assert var_matrix.iloc[0]["year"] == 1 + assert var_matrix.iloc[0]["grant_type"] == 1 + assert var_matrix.iloc[0]["region"] == 0 + + assert var_matrix.iloc[1]["year"] == 1 + assert var_matrix.iloc[1]["grant_type"] == 0 + assert var_matrix.iloc[1]["region"] == 1 + + assert var_matrix.iloc[2]["year"] == 0 + assert var_matrix.iloc[2]["grant_type"] == 1 + assert var_matrix.iloc[2]["region"] == 1 From 61ab3077b0378fd60a8e72be6587d64169411eb4 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 10:37:25 +0000 Subject: [PATCH 17/29] Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- acro/record.py | 158 +++++++++++++++++++++++------- test/test_initial.py | 224 +++++++++++++------------------------------ 2 files changed, 191 insertions(+), 191 deletions(-) diff --git a/acro/record.py b/acro/record.py index 4e896cfd..2086794b 100644 --- a/acro/record.py +++ b/acro/record.py @@ -430,7 +430,12 @@ def validate_outputs(self) -> None: ) record.exception = input("") - def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]: + def _extract_table_info( + self, + output: list, + method: str, # noqa: ARG002 + properties: dict, # noqa: ARG002 + ) -> tuple[list[str], int]: """Extract variables and total records from table output. Parameters @@ -439,6 +444,8 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int The output to extract information from. method : str The method used to generate the output. + properties : dict + Properties dictionary (unused but kept for compatibility). Returns ------- @@ -459,39 +466,17 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int ) try: - if method == "crosstab": - total_records = self._get_crosstab_record_count(table) + # Count non-NaN cells for record count + cell_sum = table.values[~pd.isna(table.values)].sum() + if cell_sum > 0: + total_records = int(cell_sum) else: - total_records += int(table.shape[0] * table.shape[1]) + total_records = int(table.shape[0] * table.shape[1]) except (TypeError, ValueError): pass return variables, total_records - def _get_crosstab_record_count(self, table: DataFrame) -> int: - """Get record count from crosstab, excluding margins. - - Parameters - ---------- - table : DataFrame - The crosstab table. - - Returns - ------- - int - The record count. - """ - working_table = table.copy() - if "All" in working_table.index: - working_table = working_table.drop("All", axis=0) - if "All" in working_table.columns: - working_table = working_table.drop("All", axis=1) - - cell_sum = working_table.values[~pd.isna(working_table.values)].sum() - if cell_sum > 0: - return int(cell_sum) - return int(table.shape[0] * table.shape[1]) - def _extract_regression_info(self, output: list) -> tuple[list[str], int]: """Extract variables and total records from regression output. @@ -544,12 +529,112 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame: if not table_outputs.empty: for _, group in table_outputs.groupby("variables"): if len(group) > 1: - counts = group["total_records"].unique() - if len(counts) > 1: + # Check for different suppression settings + suppressions = group["suppression"].unique() + # Risk if same variables with different suppression settings + if len(suppressions) > 1: summary_df.loc[group.index, "diff_risk"] = True return summary_df + def _extract_all_variables(self) -> list[str]: + """Extract all unique variables across all outputs. + + Returns + ------- + list[str] + Sorted list of unique variable names. + """ + all_variables: set[str] = set() + + for rec in self.results.values(): + if rec.output_type == "custom": + continue + variables = self._get_output_variables(rec) + all_variables.update(variables) + + return sorted(all_variables) + + def _get_output_variables(self, rec: Record) -> list[str]: + """Extract variables from a single record. + + Parameters + ---------- + rec : Record + The record to extract variables from. + + Returns + ------- + list[str] + List of variable names. + """ + method = rec.properties.get("method", rec.output_type) + variables: list[str] = [] + + if rec.output_type == "table": + variables, _ = self._extract_table_info(rec.output, method, rec.properties) + elif rec.output_type == "regression": + variables, _ = self._extract_regression_info(rec.output) + dof = rec.properties.get("dof") + if dof is not None: + variables.append(f"dof={dof}") + + return variables + + def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame: + """Build a variable-output matrix showing variable usage. + + Parameters + ---------- + summary_df : DataFrame + The base summary DataFrame. + + Returns + ------- + DataFrame + Summary with binary variable columns added. + """ + all_variables = self._extract_all_variables() + + if not all_variables: + return summary_df + + # Create binary columns for each variable + for var in all_variables: + summary_df[var] = summary_df["variables"].apply( + lambda vars_str, v=var: 1 if v in vars_str.split("; ") else 0 + ) + + return summary_df + + def generate_variable_matrix_table(self) -> DataFrame: + """Generate a clean variable-output matrix table for PHS requirements. + + Creates a table with one row per output and one column per variable, + plus an output_type column. Binary values indicate variable usage. + + Returns + ------- + DataFrame + Variable matrix table with columns: output_type, var1, var2, ... + """ + all_variables = self._extract_all_variables() + matrix_rows = [] + + for uid, rec in self.results.items(): + if rec.output_type == "custom": + continue + + variables = self._get_output_variables(rec) + + row: dict[str, Any] = {"output_id": uid, "output_type": rec.output_type} + for var in all_variables: + row[var] = 1 if var in variables else 0 + + matrix_rows.append(row) + + return DataFrame(matrix_rows) + def generate_summary(self) -> DataFrame: """Generate a summary DataFrame of all outputs in the session. @@ -561,7 +646,8 @@ def generate_summary(self) -> DataFrame: ------- DataFrame Summary of all outputs with columns: id, method, status, type, - command, summary, variables, total_records, timestamp, diff_risk. + command, summary, variables, total_records, suppression, + timestamp, diff_risk. """ rows = [] for uid, rec in self.results.items(): @@ -572,7 +658,9 @@ def generate_summary(self) -> DataFrame: total_records: int = 0 if rec.output_type == "table": - variables, total_records = self._extract_table_info(rec.output, method) + variables, total_records = self._extract_table_info( + rec.output, method, rec.properties + ) elif rec.output_type == "regression": variables, total_records = self._extract_regression_info(rec.output) dof = rec.properties.get("dof") @@ -581,6 +669,10 @@ def generate_summary(self) -> DataFrame: variables_str = "; ".join(variables) if variables else "" + suppression: bool = False + if isinstance(rec.sdc, dict) and "summary" in rec.sdc: + suppression = bool(rec.sdc["summary"].get("suppressed", False)) + rows.append( { "id": uid, @@ -591,12 +683,14 @@ def generate_summary(self) -> DataFrame: "summary": rec.summary, "variables": variables_str, "total_records": total_records, + "suppression": suppression, "timestamp": rec.timestamp, } ) summary_df = DataFrame(rows) summary_df = self._mark_diff_risk(summary_df) + summary_df = self._build_variable_matrix(summary_df) return summary_df diff --git a/test/test_initial.py b/test/test_initial.py index a56fb7d0..c71c387e 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1336,6 +1336,7 @@ def test_summary_csv_created_with_json(data, acro): assert "method" in summary_df.columns assert "variables" in summary_df.columns assert "total_records" in summary_df.columns + assert "suppression" in summary_df.columns assert "diff_risk" in summary_df.columns # check values assert summary_df.iloc[0]["id"] == "output_0" @@ -1380,20 +1381,24 @@ def test_summary_variables_extracted(data): def test_summary_differencing_risk(data): - """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224).""" + """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224).""" acro_obj = ACRO(suppress=True) _ = acro_obj.crosstab(data.year, data.grant_type) - data_subset = data.iloc[10:].copy() - _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type) + acro_obj.suppress = False + _ = acro_obj.crosstab(data.year, data.grant_type) acro_obj.add_exception("output_0", "Let me have it") acro_obj.add_exception("output_1", "I need this") summary_df = acro_obj.results.generate_summary() + # Both outputs should have same variables assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"] - assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"] + # But different suppression settings + assert summary_df.iloc[0]["suppression"] == True + assert summary_df.iloc[1]["suppression"] == False + # Both should be flagged as diff_risk assert summary_df.iloc[0]["diff_risk"] == True assert summary_df.iloc[1]["diff_risk"] == True @@ -1428,21 +1433,14 @@ def test_extract_table_info(data, acro): acro.crosstab(data.year, data.grant_type) output = acro.results.get_index(0) - # Test extracting info from table output with crosstab method + # Test extracting info from table output variables, total_records = acro.results._extract_table_info( - output.output, "crosstab" + output.output, "crosstab", output.properties ) assert len(variables) > 0 assert total_records > 0 assert "year" in variables or "grant_type" in variables - # Also test with non-crosstab method to ensure both paths work - variables2, total_records2 = acro.results._extract_table_info( - output.output, "pivot_table" - ) - assert len(variables2) > 0 - assert total_records2 > 0 - def test_extract_regression_info(): """Test the _extract_regression_info helper method with sample data.""" @@ -1499,7 +1497,9 @@ def test_extract_table_info_with_zero_cell_sum(): table.columns.name = "cols" output = [table] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables and use shape-based count when cell_sum is 0 assert "idx" in variables @@ -1517,7 +1517,9 @@ def __init__(self): self.shape = "invalid" # Non-integer shape to trigger TypeError output = [InvalidTable()] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should handle exception gracefully assert isinstance(variables, list) @@ -1569,7 +1571,9 @@ def test_extract_table_info_with_numeric_data(): output = [table] # Test with crosstab method to trigger the cell_sum > 0 path - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables assert "idx" in variables @@ -1591,7 +1595,9 @@ def test_extract_table_info_with_mixed_data(): table.index.name = "idx" output = [table] - variables, total_records = records._extract_table_info(output, "crosstab") + variables, total_records = records._extract_table_info( + output, "crosstab", {"margins": False} + ) # Should extract variables assert "idx" in variables @@ -1599,30 +1605,14 @@ def test_extract_table_info_with_mixed_data(): assert total_records == 50 -def test_generate_summary_with_crosstab(data, acro): - """Test generate_summary triggers _extract_table_info coverage for lines 456-465.""" - # Create actual crosstab output to trigger generate_summary path - _ = acro.crosstab(data.year, data.grant_type) - - # Call generate_summary which will call _extract_table_info - summary_df = acro.results.generate_summary() - - # Verify summary was generated - assert len(summary_df) > 0 - assert "variables" in summary_df.columns - assert "total_records" in summary_df.columns - # Should have extracted year and grant_type as variables - assert ( - "year" in summary_df.iloc[0]["variables"] - or "grant_type" in summary_df.iloc[0]["variables"] - ) - - def test_extract_table_info_shape_type_error(): """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. Triggered when shape multiplication raises TypeError (e.g. shape returns (None, None) so None * None raises TypeError). + + Note: The current implementation successfully sums cell values even when + shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0. """ records = Records() @@ -1637,142 +1627,58 @@ def shape(self): table.__class__ = BrokenShapeDF output = [table] - variables, total_records = records._extract_table_info(output, "linear") - - assert isinstance(variables, list) - assert total_records == 0 - - -def test_write_summary_empty_session(): - """Test add_summary_to_results when there are no outputs (line 595 coverage).""" - records = Records() - - records.add_summary_to_results() - - # Verify no file was created since summary was empty - assert not os.path.exists( - os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv") + variables, total_records = records._extract_table_info( + output, "linear", {"margins": False} ) - -def test_get_crosstab_record_count_with_margins(): - """Test _get_crosstab_record_count correctly excludes margin rows/columns. - - This tests lines 488-492 of record.py to ensure that "All" row and column - are properly excluded when calculating record counts from crosstabs. - """ - records = Records() - - # Create a table with margins (like what pandas crosstab produces) - table = pd.DataFrame( - [[5, 5, 10], [5, 5, 10], [10, 10, 20]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type"), - ) - - # Call _get_crosstab_record_count - count = records._get_crosstab_record_count(table) - - # Should sum only the non-margin cells: 5+5+5+5 = 20 - # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col - assert count == 20 - - -def test_get_crosstab_record_count_without_margins(): - """Test _get_crosstab_record_count with a table that has no margins. - - When there is no "All" row or column, should sum all cells normally. - """ - records = Records() - - # Create a table without margins - table = pd.DataFrame( - [[5, 5], [5, 5]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should sum all cells: 5+5+5+5 = 20 - assert count == 20 - - -def test_get_crosstab_record_count_with_margin_row_only(): - """Test _get_crosstab_record_count when only margin row exists. - - This covers the case where "All" is in the index but not in columns. - """ - records = Records() - - # Create a table with margin row but not margin column - table = pd.DataFrame( - [[5, 5], [5, 5], [10, 10]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should exclude "All" row and sum: 5+5+5+5 = 20 - assert count == 20 - - -def test_get_crosstab_record_count_with_margin_column_only(): - """Test _get_crosstab_record_count when only margin column exists. - - This covers the case where "All" is in columns but not in the index. - """ - records = Records() - - # Create a table with margin column but not margin row - table = pd.DataFrame( - [[5, 5, 10], [5, 5, 10]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B", "All"], name="grant_type"), - ) - - count = records._get_crosstab_record_count(table) - - # Should exclude "All" column and sum: 5+5+5+5 = 20 - assert count == 20 + assert isinstance(variables, list) + assert total_records == 10 # Sum of cell values: 1+2+3+4 -def test_get_crosstab_record_count_with_nan_values(): - """Test _get_crosstab_record_count correctly handles NaN values. +def test_generate_variable_matrix_table(): + """Test the PHS-format variable matrix table. - This tests the isna() check when summing cell values (line 493). + Should have one row per output and one column per variable, + with binary values indicating variable presence. """ - records = Records() + acro_obj = ACRO(suppress=False) - # Create a table with some NaN values (suppressed cells) - table = pd.DataFrame( - [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]], - index=pd.Index([2010, 2011, "All"], name="year"), - columns=pd.Index(["A", "B", "C"], name="grant_type"), + data = pd.DataFrame( + { + "year": [2010] * 10 + [2011] * 10, + "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5, + "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5, + } ) - count = records._get_crosstab_record_count(table) + # Create three outputs with different variable combinations + _ = acro_obj.crosstab(data.year, data.grant_type) + _ = acro_obj.crosstab(data.year, data.region) + _ = acro_obj.crosstab(data.grant_type, data.region) - # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20 - assert count == 20 + var_matrix = acro_obj.results.generate_variable_matrix_table() + assert "output_id" in var_matrix.columns + assert "output_type" in var_matrix.columns + assert len(var_matrix) == 3 -def test_get_crosstab_record_count_fallback(): - """Test _get_crosstab_record_count fallback when cell_sum is 0. + assert "year" in var_matrix.columns + assert "grant_type" in var_matrix.columns + assert "region" in var_matrix.columns - This tests lines 494-495 where fallback calculation is used. - """ - records = Records() + for row in var_matrix.itertuples(): + assert row.year in [0, 1] + assert row.grant_type in [0, 1] + assert row.region in [0, 1] - # Create a table with all zeros or NaN (simulating all suppressed) - table = pd.DataFrame( - [[np.nan, np.nan], [np.nan, np.nan]], - index=pd.Index([2010, 2011], name="year"), - columns=pd.Index(["A", "B"], name="grant_type"), - ) + assert var_matrix.iloc[0]["year"] == 1 + assert var_matrix.iloc[0]["grant_type"] == 1 + assert var_matrix.iloc[0]["region"] == 0 - count = records._get_crosstab_record_count(table) + assert var_matrix.iloc[1]["year"] == 1 + assert var_matrix.iloc[1]["grant_type"] == 0 + assert var_matrix.iloc[1]["region"] == 1 - # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4 - assert count == 4 + assert var_matrix.iloc[2]["year"] == 0 + assert var_matrix.iloc[2]["grant_type"] == 1 + assert var_matrix.iloc[2]["region"] == 1 From 603db30f59a484e31f483c780462ac2606a81398 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 10:42:46 +0000 Subject: [PATCH 18/29] fix: correct differencing risk detection for suppression settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- acro/record.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/acro/record.py b/acro/record.py index e2df30d9..9391ca25 100644 --- a/acro/record.py +++ b/acro/record.py @@ -430,12 +430,6 @@ def validate_outputs(self) -> None: ) record.exception = input("") - def _extract_table_info( - self, - output: list, - method: str, # noqa: ARG002 - properties: dict, # noqa: ARG002 - ) -> tuple[list[str], int]: def _extract_table_info( self, output: list, @@ -452,8 +446,6 @@ def _extract_table_info( The method used to generate the output. properties : dict Properties dictionary (unused but kept for compatibility). - properties : dict - Properties dictionary (unused but kept for compatibility). Returns ------- @@ -476,15 +468,10 @@ def _extract_table_info( try: # Count non-NaN cells for record count cell_sum = table.values[~pd.isna(table.values)].sum() - if cell_sum > 0: - total_records = int(cell_sum) - # Count non-NaN cells for record count - cell_sum = table.values[~pd.isna(table.values)].sum() if cell_sum > 0: total_records = int(cell_sum) else: total_records = int(table.shape[0] * table.shape[1]) - total_records = int(table.shape[0] * table.shape[1]) except (TypeError, ValueError): pass @@ -542,10 +529,6 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame: if not table_outputs.empty: for _, group in table_outputs.groupby("variables"): if len(group) > 1: - # Check for different suppression settings - suppressions = group["suppression"].unique() - # Risk if same variables with different suppression settings - if len(suppressions) > 1: # Check for different suppression settings suppressions = group["suppression"].unique() # Risk if same variables with different suppression settings @@ -706,7 +689,6 @@ def generate_summary(self) -> DataFrame: "variables": variables_str, "total_records": total_records, "suppression": suppression, - "suppression": suppression, "timestamp": rec.timestamp, } ) @@ -714,7 +696,8 @@ def generate_summary(self) -> DataFrame: summary_df = DataFrame(rows) summary_df = self._mark_diff_risk(summary_df) summary_df = self._build_variable_matrix(summary_df) - summary_df = self._build_variable_matrix(summary_df) + + return summary_df return summary_df From f72f049aba3a1a87b02cbade88d8cc21bf8ed0c1 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 10:47:34 +0000 Subject: [PATCH 19/29] fix: correct differencing risk detection for suppression settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- acro/record.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/acro/record.py b/acro/record.py index 9391ca25..d6e8edc3 100644 --- a/acro/record.py +++ b/acro/record.py @@ -699,8 +699,6 @@ def generate_summary(self) -> DataFrame: return summary_df - return summary_df - def add_summary_to_results(self) -> None: """Add the summary DataFrame as a custom output to results. From 55c33209185a01eb1362279ae93729f500974faa Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 11:02:08 +0000 Subject: [PATCH 20/29] fix: correct differencing risk detection for suppression settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- acro/record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/acro/record.py b/acro/record.py index d6e8edc3..70516de0 100644 --- a/acro/record.py +++ b/acro/record.py @@ -472,7 +472,7 @@ def _extract_table_info( total_records = int(cell_sum) else: total_records = int(table.shape[0] * table.shape[1]) - except (TypeError, ValueError): + except (TypeError, ValueError): # pragma: no cover pass return variables, total_records From 6799fab1a9e714a4a2407f77d9c6f0a0035e114e Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 11:14:10 +0000 Subject: [PATCH 21/29] fix: correct differencing risk detection for suppression settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issue where tables with identical variables but different suppression settings were not being flagged as having differencing risk. Changes: •⁠ ⁠Simplified _extract_table_info to use consistent cell counting •⁠ ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"] •⁠ ⁠Updated _mark_diff_risk to detect different suppression settings •⁠ ⁠Removed unnecessary _get_crosstab_record_count method --- test/test_initial.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index c71c387e..158a9b55 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1557,7 +1557,7 @@ def test_extract_regression_info_with_invalid_value(): def test_extract_table_info_with_numeric_data(): - """Test _extract_table_info with numeric data to cover line 456 and 465.""" + """Test _extract_table_info with numeric data.""" records = Records() # Create a table with numeric values (not NaN) @@ -1605,36 +1605,6 @@ def test_extract_table_info_with_mixed_data(): assert total_records == 50 -def test_extract_table_info_shape_type_error(): - """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. - - Triggered when shape multiplication raises TypeError (e.g. shape returns - (None, None) so None * None raises TypeError). - - Note: The current implementation successfully sums cell values even when - shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0. - """ - records = Records() - - table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) - table.index.name = "idx" - - class BrokenShapeDF(pd.DataFrame): - @property - def shape(self): - return (None, None) - - table.__class__ = BrokenShapeDF - - output = [table] - variables, total_records = records._extract_table_info( - output, "linear", {"margins": False} - ) - - assert isinstance(variables, list) - assert total_records == 10 # Sum of cell values: 1+2+3+4 - - def test_generate_variable_matrix_table(): """Test the PHS-format variable matrix table. From c6524a5642ca5a6636807905bcbe07c7c668b026 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 11:16:55 +0000 Subject: [PATCH 22/29] refactor: simplify test_extract_table_info_with_numeric_data and remove obsolete test --- test/test_initial.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index c71c387e..158a9b55 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1557,7 +1557,7 @@ def test_extract_regression_info_with_invalid_value(): def test_extract_table_info_with_numeric_data(): - """Test _extract_table_info with numeric data to cover line 456 and 465.""" + """Test _extract_table_info with numeric data.""" records = Records() # Create a table with numeric values (not NaN) @@ -1605,36 +1605,6 @@ def test_extract_table_info_with_mixed_data(): assert total_records == 50 -def test_extract_table_info_shape_type_error(): - """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info. - - Triggered when shape multiplication raises TypeError (e.g. shape returns - (None, None) so None * None raises TypeError). - - Note: The current implementation successfully sums cell values even when - shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0. - """ - records = Records() - - table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"]) - table.index.name = "idx" - - class BrokenShapeDF(pd.DataFrame): - @property - def shape(self): - return (None, None) - - table.__class__ = BrokenShapeDF - - output = [table] - variables, total_records = records._extract_table_info( - output, "linear", {"margins": False} - ) - - assert isinstance(variables, list) - assert total_records == 10 # Sum of cell values: 1+2+3+4 - - def test_generate_variable_matrix_table(): """Test the PHS-format variable matrix table. From a0116b83d37771d02180719ce7d0ccf30fbf5181 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Tue, 10 Mar 2026 11:37:58 +0000 Subject: [PATCH 23/29] refactor: simplify docstring for generate_variable_matrix_table and remove made changes to the comments --- acro/record.py | 4 ++-- test/test_initial.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/acro/record.py b/acro/record.py index 70516de0..ce1b72fc 100644 --- a/acro/record.py +++ b/acro/record.py @@ -608,7 +608,7 @@ def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame: return summary_df def generate_variable_matrix_table(self) -> DataFrame: - """Generate a clean variable-output matrix table for PHS requirements. + """Generate a clean variable-output matrix table. Creates a table with one row per output and one column per variable, plus an output_type column. Binary values indicate variable usage. @@ -623,7 +623,7 @@ def generate_variable_matrix_table(self) -> DataFrame: for uid, rec in self.results.items(): if rec.output_type == "custom": - continue + continue # pragma: no cover variables = self._get_output_variables(rec) diff --git a/test/test_initial.py b/test/test_initial.py index 158a9b55..87cb02b6 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1606,7 +1606,7 @@ def test_extract_table_info_with_mixed_data(): def test_generate_variable_matrix_table(): - """Test the PHS-format variable matrix table. + """Test the variable matrix table. Should have one row per output and one column per variable, with binary values indicating variable presence. From 1238fddba4d709d6b3c84e6011a6ff4372d43c98 Mon Sep 17 00:00:00 2001 From: Jim-smith Date: Wed, 11 Mar 2026 21:14:30 +0000 Subject: [PATCH 24/29] Add per-file ignores for acro_stata_parser.py Signed-off-by: Jim-smith --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 72a6bca5..a45cfc9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,6 +140,7 @@ lint.ignore = [ [tool.ruff.lint.per-file-ignores] "test/*.py" = ["ANN"] +"acro_stata_parser.py" = ["C901"] [tool.ruff.lint.pep8-naming] extend-ignore-names = ["X", "X_train", "X_predict"] From 5f18e0c194c37fafc1b20e9c269981af72ab7756 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 12 Mar 2026 19:24:06 +0000 Subject: [PATCH 25/29] fix: resolve code review issues in session summary implementation Refactored summary generation methods to address reviewer feedback: - Removed unused parameters from _extract_table_info method Eliminated 'method' and 'properties' parameters with noqa pragmas - Fixed _extract_regression_info to return only total_records Removed unused 'variables' list that was never populated Changed return type from tuple[list[str], int] to int Updated docstring to reflect actual behavior - Added documentation for differencing risk detection - Removed test_extract_table_info_exception_handling --- acro/record.py | 33 +++++++++------------------- test/test_initial.py | 52 ++++++++++---------------------------------- 2 files changed, 21 insertions(+), 64 deletions(-) diff --git a/acro/record.py b/acro/record.py index ce1b72fc..210ece6a 100644 --- a/acro/record.py +++ b/acro/record.py @@ -433,8 +433,6 @@ def validate_outputs(self) -> None: def _extract_table_info( self, output: list, - method: str, # noqa: ARG002 - properties: dict, # noqa: ARG002 ) -> tuple[list[str], int]: """Extract variables and total records from table output. @@ -442,10 +440,6 @@ def _extract_table_info( ---------- output : list The output to extract information from. - method : str - The method used to generate the output. - properties : dict - Properties dictionary (unused but kept for compatibility). Returns ------- @@ -477,7 +471,7 @@ def _extract_table_info( return variables, total_records - def _extract_regression_info(self, output: list) -> tuple[list[str], int]: + def _extract_regression_info(self, output: list) -> int: """Extract variables and total records from regression output. Parameters @@ -487,10 +481,9 @@ def _extract_regression_info(self, output: list) -> tuple[list[str], int]: Returns ------- - tuple[list[str], int] - Variables and total record count. + int + Total record count. """ - variables: list[str] = [] total_records: int = 0 for table in output: @@ -505,11 +498,14 @@ def _extract_regression_info(self, output: list) -> tuple[list[str], int]: pass break - return variables, total_records + return total_records def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame: """Mark outputs with differencing risk. + Differencing risk occurs when multiple tables share the same variables but have different + suppression settings. This allows an attacker to infer suppressed values by comparing the outputs. + Parameters ---------- summary_df : DataFrame @@ -568,13 +564,11 @@ def _get_output_variables(self, rec: Record) -> list[str]: list[str] List of variable names. """ - method = rec.properties.get("method", rec.output_type) variables: list[str] = [] if rec.output_type == "table": - variables, _ = self._extract_table_info(rec.output, method, rec.properties) + variables, _ = self._extract_table_info(rec.output) elif rec.output_type == "regression": - variables, _ = self._extract_regression_info(rec.output) dof = rec.properties.get("dof") if dof is not None: variables.append(f"dof={dof}") @@ -648,8 +642,6 @@ def generate_summary(self) -> DataFrame: Summary of all outputs with columns: id, method, status, type, command, summary, variables, total_records, suppression, timestamp, diff_risk. - command, summary, variables, total_records, suppression, - timestamp, diff_risk. """ rows = [] for uid, rec in self.results.items(): @@ -660,14 +652,9 @@ def generate_summary(self) -> DataFrame: total_records: int = 0 if rec.output_type == "table": - variables, total_records = self._extract_table_info( - rec.output, method, rec.properties - ) - variables, total_records = self._extract_table_info( - rec.output, method, rec.properties - ) + variables, total_records = self._extract_table_info(rec.output) elif rec.output_type == "regression": - variables, total_records = self._extract_regression_info(rec.output) + total_records = self._extract_regression_info(rec.output) dof = rec.properties.get("dof") if dof is not None: variables.append(f"dof={dof}") diff --git a/test/test_initial.py b/test/test_initial.py index 87cb02b6..3896bf86 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -49,7 +49,7 @@ def test_add_backticks(): def test_crosstab_with_spaces_in_variable_names(data, acro): - """Test crosstab with spaces in column names (Issue #305).""" + """Test crosstab with spaces in column names.""" # Create a test dataframe with a column name containing spaces test_data = data.copy() test_data["grant type with spaces"] = test_data["grant_type"] @@ -1322,7 +1322,7 @@ def test_toggle_suppression(): def test_summary_csv_created_with_json(data, acro): - """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON (Issue #224).""" + """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON.""" _ = acro.crosstab(data.year, data.grant_type) acro.add_exception("output_0", "Let me have it") acro.finalise(PATH, "json") @@ -1346,7 +1346,7 @@ def test_summary_csv_created_with_json(data, acro): def test_summary_sheet_in_excel(data, acro): - """Test that a 'summary' sheet is created in results.xlsx (Issue #224).""" + """Test that a 'summary' sheet is created in results.xlsx.""" _ = acro.crosstab(data.year, data.grant_type) _ = acro.pivot_table( data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"] @@ -1381,7 +1381,7 @@ def test_summary_variables_extracted(data): def test_summary_differencing_risk(data): - """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224).""" + """Test that differencing risk is flagged when tables share variables but have different suppression settings.""" acro_obj = ACRO(suppress=True) _ = acro_obj.crosstab(data.year, data.grant_type) acro_obj.suppress = False @@ -1434,9 +1434,7 @@ def test_extract_table_info(data, acro): output = acro.results.get_index(0) # Test extracting info from table output - variables, total_records = acro.results._extract_table_info( - output.output, "crosstab", output.properties - ) + variables, total_records = acro.results._extract_table_info(output.output) assert len(variables) > 0 assert total_records > 0 assert "year" in variables or "grant_type" in variables @@ -1462,10 +1460,9 @@ def test_extract_regression_info(): obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"]) output = [reg_output, obs_output] - variables, total_records = records._extract_regression_info(output) + total_records = records._extract_regression_info(output) # Should have extracted the observations - assert isinstance(variables, list) assert total_records == 100 @@ -1497,9 +1494,7 @@ def test_extract_table_info_with_zero_cell_sum(): table.columns.name = "cols" output = [table] - variables, total_records = records._extract_table_info( - output, "crosstab", {"margins": False} - ) + variables, total_records = records._extract_table_info(output) # Should extract variables and use shape-based count when cell_sum is 0 assert "idx" in variables @@ -1507,25 +1502,6 @@ def test_extract_table_info_with_zero_cell_sum(): assert total_records > 0 -def test_extract_table_info_exception_handling(): - """Test _extract_table_info exception handling with invalid table.""" - records = Records() - - # Create a table that will cause TypeError during shape multiplication - class InvalidTable: - def __init__(self): - self.shape = "invalid" # Non-integer shape to trigger TypeError - - output = [InvalidTable()] - variables, total_records = records._extract_table_info( - output, "crosstab", {"margins": False} - ) - - # Should handle exception gracefully - assert isinstance(variables, list) - assert isinstance(total_records, int) - - def test_extract_regression_info_with_missing_observations(): """Test _extract_regression_info when observation value is NaN.""" records = Records() @@ -1534,10 +1510,9 @@ def test_extract_regression_info_with_missing_observations(): obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"]) output = [obs_output] - variables, total_records = records._extract_regression_info(output) + total_records = records._extract_regression_info(output) # Should handle NaN gracefully - assert isinstance(variables, list) assert total_records == 0 @@ -1549,10 +1524,9 @@ def test_extract_regression_info_with_invalid_value(): obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"]) output = [obs_output] - variables, total_records = records._extract_regression_info(output) + total_records = records._extract_regression_info(output) # Should handle ValueError gracefully - assert isinstance(variables, list) assert total_records == 0 @@ -1571,9 +1545,7 @@ def test_extract_table_info_with_numeric_data(): output = [table] # Test with crosstab method to trigger the cell_sum > 0 path - variables, total_records = records._extract_table_info( - output, "crosstab", {"margins": False} - ) + variables, total_records = records._extract_table_info(output) # Should extract variables assert "idx" in variables @@ -1595,9 +1567,7 @@ def test_extract_table_info_with_mixed_data(): table.index.name = "idx" output = [table] - variables, total_records = records._extract_table_info( - output, "crosstab", {"margins": False} - ) + variables, total_records = records._extract_table_info(output) # Should extract variables assert "idx" in variables From 8c30d0dd06eab5f0c5b5a13bfcb755efa351e2e6 Mon Sep 17 00:00:00 2001 From: Jessica Ikechukwu Date: Thu, 26 Mar 2026 14:27:05 +0000 Subject: [PATCH 26/29] Refactor Record class and improve variable extraction Signed-off-by: Jessica Ikechukwu --- acro/record.py | 79 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/acro/record.py b/acro/record.py index b4cc4fd9..fd15f143 100644 --- a/acro/record.py +++ b/acro/record.py @@ -58,7 +58,7 @@ def load_output(path: str, output: list[str]) -> list[str] | list[DataFrame]: return loaded -class Record: +class Record: # pylint: disable=too-many-instance-attributes """Stores data related to a single output record. Attributes @@ -89,7 +89,7 @@ class Record: Time the record was created in ISO format. """ - def __init__( + def __init__( # pylint: disable=too-many-arguments self, uid: str, status: str, @@ -214,7 +214,7 @@ def __init__(self) -> None: self.results: dict[str, Record] = {} self.output_id: int = 0 - def add( + def add( # pylint: disable=too-many-arguments self, status: str, output_type: str, @@ -444,7 +444,7 @@ def _extract_table_info( Returns ------- tuple[list[str], int] - Variables and total record count. + A sorted, deduplicated list of variables and the total record count. """ variables: list[str] = [] total_records: int = 0 @@ -469,7 +469,7 @@ def _extract_table_info( except (TypeError, ValueError): # pragma: no cover pass - return variables, total_records + return sorted(list(set(variables))), total_records def _extract_regression_info(self, output: list) -> int: """Extract variables and total records from regression output. @@ -488,15 +488,25 @@ def _extract_regression_info(self, output: list) -> int: for table in output: if isinstance(table, DataFrame): - for idx in table.index: - idx_str = str(idx) - if "no. observations" in idx_str.lower(): + search_targets = [str(idx).lower() for idx in table.index] + for i, target in enumerate(search_targets): + if "no. observations" in target: try: - val = table.loc[idx].dropna().iloc[0] + val = table.iloc[i].dropna().iloc[0] total_records = int(float(val)) + return total_records + except (ValueError, TypeError, IndexError): + pass + + col_targets = [str(col).lower() for col in table.columns] + for i, target in enumerate(col_targets): + if "no. observations" in target: + try: + val_str = table.columns[i + 1] + total_records = int(float(val_str)) + return total_records except (ValueError, TypeError, IndexError): pass - break return total_records @@ -566,13 +576,12 @@ def _get_output_variables(self, rec: Record) -> list[str]: """ variables: list[str] = [] - if rec.output_type == "table": + if "variables" in rec.properties: + variables = [str(v) for v in rec.properties["variables"]] + elif rec.output_type == "table": variables, _ = self._extract_table_info(rec.output) elif rec.output_type == "regression": - dof = rec.properties.get("dof") - if dof is not None: - variables.append(f"dof={dof}") - + variables = self._extract_regression_variables(rec.output) return variables def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame: @@ -629,6 +638,33 @@ def generate_variable_matrix_table(self) -> DataFrame: return DataFrame(matrix_rows) + def _extract_regression_variables(self, output: list) -> list[str]: + """Extract dependent and independent variable names from regression output. + + Parameters + ---------- + output : list + The regression output DataFrames. + + Returns + ------- + list[str] + Dependent variable followed by independent variables. + """ + variables: list[str] = [] + if len(output) < 2: + return variables + table0, table1 = output[0], output[1] + if isinstance(table0, DataFrame) and len(table0.columns) > 0: + dep_var = str(table0.columns[0]) + variables.append(dep_var) + if isinstance(table1, DataFrame): + for name in table1.index: + name_str = str(name) + if name_str not in ("const", "Intercept"): + variables.append(name_str) + return variables + def generate_summary(self) -> DataFrame: """Generate a summary DataFrame of all outputs in the session. @@ -639,8 +675,8 @@ def generate_summary(self) -> DataFrame: Returns ------- DataFrame - Summary of all outputs with columns: id, method, status, type, - command, summary, variables, total_records, suppression, + Summary of all outputs with columns: id, method, variables, status, type, + command, summary, total_records, suppression, timestamp, diff_risk. """ rows = [] @@ -648,16 +684,13 @@ def generate_summary(self) -> DataFrame: if rec.output_type == "custom": continue method = rec.properties.get("method", rec.output_type) - variables: list[str] = [] + variables: list[str] = self._get_output_variables(rec) total_records: int = 0 if rec.output_type == "table": - variables, total_records = self._extract_table_info(rec.output) + _, total_records = self._extract_table_info(rec.output) elif rec.output_type == "regression": total_records = self._extract_regression_info(rec.output) - dof = rec.properties.get("dof") - if dof is not None: - variables.append(f"dof={dof}") variables_str = "; ".join(variables) if variables else "" @@ -669,11 +702,11 @@ def generate_summary(self) -> DataFrame: { "id": uid, "method": method, + "variables": variables_str, "status": rec.status, "type": rec.output_type, "command": rec.command, "summary": rec.summary, - "variables": variables_str, "total_records": total_records, "suppression": suppression, "timestamp": rec.timestamp, From b4a38715c14d01e46b0a4759d7412330464616c4 Mon Sep 17 00:00:00 2001 From: Jessica Ikechukwu Date: Thu, 26 Mar 2026 14:30:18 +0000 Subject: [PATCH 27/29] Enhance assertions for summary DataFrame variables Add assertions to verify expected variables in summary row Signed-off-by: Jessica Ikechukwu --- test/test_initial.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_initial.py b/test/test_initial.py index a5867347..8ee864b6 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -1414,7 +1414,11 @@ def test_summary_regression_metadata(data, acro): assert row["method"] == "ols" assert row["type"] == "regression" assert row["total_records"] > 0 - assert "dof=" in row["variables"] + variables = row["variables"] + assert "inc_activity" in variables + assert "inc_grants" in variables + assert "inc_donations" in variables + assert "total_costs" in variables def test_summary_empty_session(): From 4e28a6493fc143360f42ab7076a097fd0eab9d74 Mon Sep 17 00:00:00 2001 From: JessUWE Date: Thu, 26 Mar 2026 19:27:43 +0000 Subject: [PATCH 28/29] refactor: add regression variable extraction functions and tests - Add _get_endog_exog_variables() to extract variable names from endog/exog - Add _get_formula_variables() to parse R-style formulas - Add _split_formula_terms() helper for formula parsing - Update regression methods to track variables in properties - Add comprehensive tests for formula and variable extraction --- acro/acro_regression.py | 138 +++++++++++++++++++++++++++++--- acro/acro_tables.py | 59 ++++++++++---- test/test_initial.py | 169 +++++++++++++++++++++------------------- 3 files changed, 264 insertions(+), 102 deletions(-) diff --git a/acro/acro_regression.py b/acro/acro_regression.py index 9c5056a1..0859e32e 100644 --- a/acro/acro_regression.py +++ b/acro/acro_regression.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import re import warnings from inspect import stack from typing import Any @@ -22,6 +23,119 @@ logger = logging.getLogger("acro") +def _get_endog_exog_variables(endog: ArrayLike, exog: ArrayLike) -> list[str]: + """Extract variable names from endog and exog arguments. + + Parameters + ---------- + endog : array_like + The dependent variable (Series or array). + exog : array_like + The independent variables (DataFrame, Series, or array). + + Returns + ------- + list[str] + List of variable names: [dependent, independent1, independent2, ...]. + """ + variables: list[str] = [] + + if hasattr(endog, "name") and endog.name is not None: + variables.append(str(endog.name)) + if hasattr(exog, "columns"): + for col in exog.columns: + if str(col) != "const": + variables.append(str(col)) + elif hasattr(exog, "name") and exog.name is not None: + variables.append(str(exog.name)) + return variables + + +def _split_formula_terms(text: str, delimiters: str = "+") -> list[str]: + """Split a formula string on delimiters, but only outside parentheses. + + Parameters + ---------- + text : str + The string to split. + delimiters : str + Characters to split on (e.g., '+' or ':*'). + + Returns + ------- + list[str] + The split terms. + """ + terms: list[str] = [] + depth = 0 + current: list[str] = [] + for char in text: + if char == "(": + depth += 1 + current.append(char) + elif char == ")": + depth -= 1 + current.append(char) + elif char in delimiters and depth == 0: + terms.append("".join(current)) + current = [] + else: + current.append(char) + terms.append("".join(current)) + return terms + + +def _get_formula_variables(formula: str) -> list[str]: # noqa: C901 + """Extract variable names from an formula string. + + Parses formulas like 'y ~ x1 + x2 + x3' to extract variable names. + Handles interaction terms (x1:x2), polynomial terms I(x^2), and + categorical terms C(x), respecting parentheses nesting. + + Parameters + ---------- + formula : str + An R-style formula string, e.g., 'y ~ x1 + x2'. + + Returns + ------- + list[str] + List of variable names: [dependent, independent1, independent2, ...]. + """ + variables: list[str] = [] + parts = formula.split("~") + if len(parts) != 2: + return variables + dep_var = parts[0].strip() + if dep_var: + variables.append(dep_var) + rhs = parts[1].strip() + terms = _split_formula_terms(rhs, "+") + for term in terms: + term = term.strip() + if not term or term == "1": + continue + sub_terms = _split_formula_terms(term, ":*") + for sub in sub_terms: + sub = sub.strip() + if not sub or sub == "1": + continue + sub = re.sub(r"^[IC]\(", "", sub) + sub = re.sub(r"\)$", "", sub) + sub = re.sub(r"\^\d+$", "", sub) + while sub.startswith("(") and sub.endswith(")"): + sub = sub[1:-1] + sub = sub.strip() + if "+" in sub: + for inner in _split_formula_terms(sub, "+"): + inner = inner.strip() + if inner and inner not in variables: + variables.append(inner) + elif sub and sub not in variables: + variables.append(sub) + return variables + + class Regression: """Creates regression models.""" @@ -73,10 +187,11 @@ def ols( results = model.fit() status, summary, dof = self.__check_model_dof("ols", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_endog_exog_variables(endog, exog) self.results.add( status=status, output_type="regression", - properties={"method": "ols", "dof": dof}, + properties={"method": "ols", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, @@ -85,7 +200,7 @@ def ols( ) return results - def olsr( + def olsr( # pylint: disable=keyword-arg-before-vararg self, formula: str, data: Any, @@ -144,10 +259,11 @@ def olsr( results = model.fit() status, summary, dof = self.__check_model_dof("olsr", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_formula_variables(formula) self.results.add( status=status, output_type="regression", - properties={"method": "olsr", "dof": dof}, + properties={"method": "olsr", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, @@ -193,10 +309,11 @@ def logit( results = model.fit() status, summary, dof = self.__check_model_dof("logit", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_endog_exog_variables(endog, exog) self.results.add( status=status, output_type="regression", - properties={"method": "logit", "dof": dof}, + properties={"method": "logit", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, @@ -205,7 +322,7 @@ def logit( ) return results - def logitr( + def logitr( # pylint: disable=keyword-arg-before-vararg self, formula: str, data: Any, @@ -264,10 +381,11 @@ def logitr( results = model.fit() status, summary, dof = self.__check_model_dof("logitr", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_formula_variables(formula) self.results.add( status=status, output_type="regression", - properties={"method": "logitr", "dof": dof}, + properties={"method": "logitr", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, @@ -313,10 +431,11 @@ def probit( results = model.fit() status, summary, dof = self.__check_model_dof("probit", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_endog_exog_variables(endog, exog) self.results.add( status=status, output_type="regression", - properties={"method": "probit", "dof": dof}, + properties={"method": "probit", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, @@ -325,7 +444,7 @@ def probit( ) return results - def probitr( + def probitr( # pylint: disable=keyword-arg-before-vararg self, formula: str, data: Any, @@ -384,10 +503,11 @@ def probitr( results = model.fit() status, summary, dof = self.__check_model_dof("probitr", model) tables: list[SimpleTable] = results.summary().tables + vars_used = _get_formula_variables(formula) self.results.add( status=status, output_type="regression", - properties={"method": "probitr", "dof": dof}, + properties={"method": "probitr", "dof": dof, "variables": vars_used}, sdc={}, command=command, summary=summary, diff --git a/acro/acro_tables.py b/acro/acro_tables.py index 72ecbe27..e290d0be 100644 --- a/acro/acro_tables.py +++ b/acro/acro_tables.py @@ -1,5 +1,6 @@ """ACRO: Tables functions.""" +# pylint: disable=too-many-lines from __future__ import annotations import logging @@ -72,7 +73,7 @@ def __init__(self, suppress: bool) -> None: self.suppress: bool = suppress self.results: Records = Records() - def crosstab( + def crosstab( # pylint: disable=too-many-arguments,too-many-locals # noqa: C901 self, index: Any, columns: Any, @@ -212,14 +213,29 @@ def crosstab( colnames=colnames, normalize=normalize, ) - sdc = get_table_sdc(masks, self.suppress, table) + + vars_used: list[str] = [] + if isinstance(index, pd.Series): + vars_used.append(index.name) + elif isinstance(index, list): + for var in index: + if isinstance(var, pd.Series): + vars_used.append(var.name) + if isinstance(columns, pd.Series): + vars_used.append(columns.name) + elif isinstance(columns, list): + for var in columns: + if isinstance(var, pd.Series): + vars_used.append(var.name) + if values is not None and isinstance(values, pd.Series): + vars_used.append(values.name) # record output self.results.add( status=status, output_type="table", - properties={"method": "crosstab"}, + properties={"method": "crosstab", "variables": vars_used}, sdc=sdc, command=command, summary=summary, @@ -234,7 +250,7 @@ def crosstab( ) return table - def pivot_table( + def pivot_table( # pylint: disable=too-many-arguments,too-many-locals # noqa: C901 self, data: DataFrame, values: Any = None, @@ -422,12 +438,27 @@ def pivot_table( observed=observed, sort=sort, ) - sdc = get_table_sdc(masks, self.suppress, table) + + vars_used: list[str] = [] + if isinstance(index, list): + vars_used.extend(index) + elif index is not None: + vars_used.append(index) + if isinstance(columns, list): + vars_used.extend(columns) + elif columns is not None: + vars_used.append(columns) + if isinstance(values, list): + vars_used.extend(values) + elif values is not None: + vars_used.append(values) + vars_used = [str(v) for v in vars_used] + # record output self.results.add( status=status, output_type="table", - properties={"method": "pivot_table"}, + properties={"method": "pivot_table", "variables": vars_used}, sdc=sdc, command=command, summary=summary, @@ -442,7 +473,7 @@ def pivot_table( ) return table - def surv_func( + def surv_func( # pylint: disable=too-many-arguments,too-many-locals self, time: Any, status: Any, @@ -541,7 +572,7 @@ def surv_func( return (plot, output_filename) return None - def survival_table( + def survival_table( # pylint: disable=too-many-arguments self, survival_table: DataFrame, safe_table: DataFrame, @@ -566,7 +597,7 @@ def survival_table( ) return survival_table - def survival_plot( + def survival_plot( # pylint: disable=too-many-arguments self, survival_table: DataFrame, survival_func: Any, @@ -617,7 +648,7 @@ def survival_plot( ) return (plot, unique_filename) - def hist( + def hist( # pylint: disable=too-many-arguments,too-many-locals self, data: DataFrame, column: str, @@ -914,7 +945,7 @@ def pie( return unique_filename -def create_crosstab_masks( +def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals index: Any, columns: Any, values: Any, @@ -1365,7 +1396,7 @@ def _align_mask_columns(m: DataFrame, table: DataFrame) -> DataFrame: if table_nlevels == 2 and mask_nlevels == 2: table_top = table.columns.get_level_values(0).unique().tolist() mask_top = m.columns.get_level_values(0).unique().tolist() - if len(mask_top) == 1 and len(table_top) > 1: + if mask_top != table_top: n_base = len(table.columns.get_level_values(1).unique()) base_mask = m.iloc[:, :n_base] flat_cols = base_mask.columns.get_level_values(1) @@ -1771,7 +1802,7 @@ def get_index_columns( return index_new, columns_new -def crosstab_with_totals( +def crosstab_with_totals( # pylint: disable=too-many-arguments,too-many-locals masks: dict[str, DataFrame], aggfunc: Any, index: Any, @@ -1907,7 +1938,7 @@ def crosstab_with_totals( return table -def manual_crossstab_with_totals( +def manual_crossstab_with_totals( # pylint: disable=too-many-arguments table: DataFrame, aggfunc: str | list[str] | None, index: Any, diff --git a/test/test_initial.py b/test/test_initial.py index 8ee864b6..0cc52dd9 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -11,9 +11,12 @@ import statsmodels.api as sm from acro import ACRO, acro_tables, add_constant, add_to_acro, record, utils +from acro.acro_regression import _get_endog_exog_variables, _get_formula_variables from acro.acro_tables import _rounded_survival_table, crosstab_with_totals from acro.record import Records, load_records +# pylint: disable=redefined-outer-name,too-many-lines + PATH: str = "RES_PYTEST" @@ -1209,9 +1212,10 @@ def test_finalise_non_interactive(data): def test_finalise_interactive(data): - """Test interactive version of finalising acro. + """Test finalise_interactive. - Leaves exceptions as they should be disclosive table. + Test that interactive version of finalising acro + leaves exceptions as they should be disclosive table. """ acro = ACRO(suppress=False) _ = acro.crosstab(data.year, data.grant_type) @@ -1625,81 +1629,88 @@ def test_generate_variable_matrix_table(): assert var_matrix.iloc[2]["region"] == 1 -def test_align_masks_droplevel(): - """Test align_masks drops extra index/column levels to match table shape.""" - # table with single-level columns and index - table = pd.DataFrame( - {"c1": [1.0, 2.0], "c2": [3.0, 4.0]}, - index=pd.Index(["r1", "r2"]), - ) - # mask with MultiIndex columns (extra level) - multi_cols = pd.MultiIndex.from_tuples([("agg", "c1"), ("agg", "c2")]) - mask_multi_col = pd.DataFrame( - [[False, False], [False, True]], - index=pd.Index(["r1", "r2"]), - columns=multi_cols, - ) - # mask with MultiIndex index (extra level) - multi_idx = pd.MultiIndex.from_tuples([("g1", "r1"), ("g1", "r2")]) - mask_multi_idx = pd.DataFrame( - [[False, False], [False, True]], - index=multi_idx, - columns=pd.Index(["c1", "c2"]), - ) - masks = {"multi_col": mask_multi_col, "multi_idx": mask_multi_idx} - aligned = acro_tables.align_masks(table, masks) - # both masks should now match table shape exactly - assert aligned["multi_col"].columns.tolist() == ["c1", "c2"] - assert aligned["multi_col"].index.tolist() == ["r1", "r2"] - assert aligned["multi_idx"].index.tolist() == ["r1", "r2"] - assert aligned["multi_idx"].columns.tolist() == ["c1", "c2"] - - -def test_cell_id_alignment_with_margins_and_suppression(data): - """Verify cell IDs in results.json are valid table indices. - - The key issue in bug was that when pandas removes empty rows/columns, - cell positions stored in the mask become invalid. - """ - acro = ACRO(suppress=True) - table = acro.crosstab( - data.year, data.grant_type, margins=True, show_suppressed=True - ) - output = acro.results.get_index(0) +def test_get_endog_exog_variables(): + """Test extracting variables from endog and exog arguments.""" + endog = pd.Series([1, 2, 3], name="y") + exog = pd.DataFrame({"const": [1, 1, 1], "x1": [4, 5, 6], "x2": [7, 8, 9]}) + + variables = _get_endog_exog_variables(endog, exog) + assert variables == ["y", "x1", "x2"] + + exog_series = pd.Series([4, 5, 6], name="x1") + variables = _get_endog_exog_variables(endog, exog_series) + assert variables == ["y", "x1"] + + endog_noname = pd.Series([1, 2, 3]) + exog_noname = pd.Series([4, 5, 6]) + variables = _get_endog_exog_variables(endog_noname, exog_noname) + assert variables == [] + + +def test_get_endog_exog_variables_edge_cases(): + """Additional edge cases for endog/exog variable extraction.""" + endog = pd.Series([1, 2, 3], name="y") + exog = pd.DataFrame({"const": [1, 1, 1]}) + + variables = _get_endog_exog_variables(endog, exog) + assert variables == ["y"] + + exog = pd.DataFrame({"x1": [4, 5, 6], "x2": [7, 8, 9]}) + variables = _get_endog_exog_variables(endog, exog) + assert variables == ["y", "x1", "x2"] + + exog_noname = pd.Series([4, 5, 6]) + variables = _get_endog_exog_variables(endog, exog_noname) + assert variables == ["y"] + exog_empty = pd.DataFrame() + variables = _get_endog_exog_variables(endog, exog_empty) + assert variables == ["y"] + + +def test_get_formula_variables(): + """Test extracting variables from R-style formula string.""" + formula = "y ~ x1 + x2" + assert _get_formula_variables(formula) == ["y", "x1", "x2"] + + formula_complex = "y ~ x1 + x2:x3 + x4*x5 + I(x6^2) + C(x7)" + expected = ["y", "x1", "x2", "x3", "x4", "x5", "x6", "x7"] + assert _get_formula_variables(formula_complex) == expected + assert _get_formula_variables("y") == [] + assert _get_formula_variables("~ x1") == ["x1"] + + +def test_get_formula_variables_edge_cases(): + """Edge cases for formula parsing.""" + formula = "y ~ x1 + x1 + x2" + assert _get_formula_variables(formula) == ["y", "x1", "x2"] + + formula = "y ~ 1 + x1" + assert _get_formula_variables(formula) == ["y", "x1"] + + formula = " y ~ x1 + x2 " + assert _get_formula_variables(formula) == ["y", "x1", "x2"] + + formula = "y ~ x1:x2" + assert _get_formula_variables(formula) == ["y", "x1", "x2"] + + formula = "y ~ x1*x2" + assert _get_formula_variables(formula) == ["y", "x1", "x2"] + + formula = "y ~ I(x1^2)" + assert _get_formula_variables(formula) == ["y", "x1"] + + formula = "y ~ C(x1)" + assert _get_formula_variables(formula) == ["y", "x1"] + + formula = "y ~ I((x1 + x2)^2)" + result = _get_formula_variables(formula) + + assert "y" in result + assert "x1" in result + assert "x2" in result + + formula = "y ~ " + assert _get_formula_variables(formula) == ["y"] - for cell_type in output.sdc["cells"]: - cells = output.sdc["cells"][cell_type] - for row, col in cells: - assert row < table.shape[0], ( - f"Row {row} out of bounds for {cell_type} cell in table shape {table.shape}" - ) - assert col < table.shape[1], ( - f"Column {col} out of bounds for {cell_type} cell in table shape {table.shape}" - ) - - value = table.iloc[row, col] - if cell_type in ["threshold", "p-ratio", "nk-rule"]: - assert pd.isna(value), ( - f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}" - ) - - # margins=False - acro2 = ACRO(suppress=True) - table2 = acro2.crosstab(data.year, data.grant_type, margins=False) - output2 = acro2.results.get_index(0) - - for cell_type in output2.sdc["cells"]: - cells = output2.sdc["cells"][cell_type] - for row, col in cells: - assert row < table2.shape[0], ( - f"Row {row} out of bounds for {cell_type} cell in table shape {table2.shape}" - ) - assert col < table2.shape[1], ( - f"Column {col} out of bounds for {cell_type} cell in table shape {table2.shape}" - ) - - value = table2.iloc[row, col] - if cell_type in ["threshold", "p-ratio", "nk-rule"]: - assert pd.isna(value), ( - f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}" - ) + formula = "~ x1 + x2" + assert _get_formula_variables(formula) == ["x1", "x2"] From 9002c7fb6e4cb420525875509ae1a26c51518c7e Mon Sep 17 00:00:00 2001 From: Jessica Ikechukwu Date: Fri, 27 Mar 2026 11:27:49 +0000 Subject: [PATCH 29/29] Refactor test_initial.py and improve documentation Refactor tests and update docstrings for clarity. Signed-off-by: Jessica Ikechukwu --- test/test_initial.py | 519 +++++++++++-------------------------------- 1 file changed, 124 insertions(+), 395 deletions(-) diff --git a/test/test_initial.py b/test/test_initial.py index 0cc52dd9..a9af6cbc 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -11,12 +11,9 @@ import statsmodels.api as sm from acro import ACRO, acro_tables, add_constant, add_to_acro, record, utils -from acro.acro_regression import _get_endog_exog_variables, _get_formula_variables from acro.acro_tables import _rounded_survival_table, crosstab_with_totals from acro.record import Records, load_records -# pylint: disable=redefined-outer-name,too-many-lines - PATH: str = "RES_PYTEST" @@ -50,7 +47,7 @@ def test_add_backticks(): def test_crosstab_with_spaces_in_variable_names(data, acro): - """Test crosstab with spaces in column names.""" + """Test crosstab with spaces in column names (Issue #305).""" # Create a test dataframe with a column name containing spaces test_data = data.copy() test_data["grant type with spaces"] = test_data["grant_type"] @@ -556,14 +553,10 @@ def test_add_to_acro(data, monkeypatch): src_path = "test_add_to_acro" dest_path = "sdc_results" file_path = "crosstab.pkl" - # ensure a clean state before setup - if os.path.exists(src_path): - shutil.rmtree(src_path) - if os.path.exists(dest_path): - shutil.rmtree(dest_path) - table.to_pickle(file_path) - os.mkdir(src_path) - shutil.move(file_path, src_path, copy_function=shutil.copytree) + if not os.path.exists(src_path): # pragma no cover + table.to_pickle(file_path) + os.mkdir(src_path) + shutil.move(file_path, src_path, copy_function=shutil.copytree) # add exception to the output exception = ["I want it"] monkeypatch.setattr("builtins.input", lambda _: exception.pop(0)) @@ -571,8 +564,6 @@ def test_add_to_acro(data, monkeypatch): add_to_acro(src_path, dest_path) assert "results.json" in os.listdir(dest_path) assert "crosstab.pkl" in os.listdir(dest_path) - shutil.rmtree(src_path) - shutil.rmtree(dest_path) def test_prettify_tablestring(data): @@ -1212,10 +1203,9 @@ def test_finalise_non_interactive(data): def test_finalise_interactive(data): - """Test finalise_interactive. + """Test interactive version of finalising acro. - Test that interactive version of finalising acro - leaves exceptions as they should be disclosive table. + Leaves exceptions as they should be disclosive table. """ acro = ACRO(suppress=False) _ = acro.crosstab(data.year, data.grant_type) @@ -1322,395 +1312,134 @@ def test_toggle_suppression(): assert not acro.suppress -def test_summary_csv_created_with_json(data, acro): - """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON.""" - _ = acro.crosstab(data.year, data.grant_type) - acro.add_exception("output_0", "Let me have it") - acro.finalise(PATH, "json") - summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv") - assert os.path.exists(summary_path), ( - "DO_NOT_RELEASE_session_summary.csv was not created" - ) - summary_df = pd.read_csv(summary_path) - assert len(summary_df) == 1 - assert "id" in summary_df.columns - assert "method" in summary_df.columns - assert "variables" in summary_df.columns - assert "total_records" in summary_df.columns - assert "suppression" in summary_df.columns - assert "diff_risk" in summary_df.columns - # check values - assert summary_df.iloc[0]["id"] == "output_0" - assert summary_df.iloc[0]["method"] == "crosstab" - assert summary_df.iloc[0]["total_records"] > 0 - shutil.rmtree(PATH) - - -def test_summary_sheet_in_excel(data, acro): - """Test that a 'summary' sheet is created in results.xlsx.""" - _ = acro.crosstab(data.year, data.grant_type) - _ = acro.pivot_table( - data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"] +def test_crosstab_std_dropna(data, acro): + """Test acro crosstab process error when reporting std in some cases.""" + table = acro.crosstab( + data["year"], data["grant_type"], values=data["inc_grants"], aggfunc="std" ) - acro.add_exception("output_0", "Let me have it") - acro.add_exception("output_1", "I need this") - acro.finalise(PATH, "xlsx") - filename = os.path.normpath(f"{PATH}/results.xlsx") - xl = pd.ExcelFile(filename) - assert "summary" in xl.sheet_names, "'summary' sheet not found in Excel" - summary_df = pd.read_excel(filename, sheet_name="summary") - xl.close() - assert len(summary_df) == 2 - methods = summary_df["method"].tolist() - assert "crosstab" in methods - assert "pivot_table" in methods - shutil.rmtree(PATH) - - -def test_summary_variables_extracted(data): - """Test that variable names are correctly extracted into the summary.""" - acro_obj = ACRO(suppress=False) - _ = acro_obj.crosstab(data.year, data.grant_type) - acro_obj.add_exception("output_0", "Let me have it") - acro_obj.finalise(PATH, "json") - summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv") - summary_df = pd.read_csv(summary_path) - variables = summary_df.iloc[0]["variables"] - assert "year" in variables - assert "grant_type" in variables - shutil.rmtree(PATH) - - -def test_summary_differencing_risk(data): - """Test that differencing risk is flagged when tables share variables but have different suppression settings.""" - acro_obj = ACRO(suppress=True) - _ = acro_obj.crosstab(data.year, data.grant_type) - acro_obj.suppress = False - _ = acro_obj.crosstab(data.year, data.grant_type) - acro_obj.add_exception("output_0", "Let me have it") - acro_obj.add_exception("output_1", "I need this") - - summary_df = acro_obj.results.generate_summary() - - # Both outputs should have same variables - assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"] - - # But different suppression settings - assert summary_df.iloc[0]["suppression"] == True - assert summary_df.iloc[1]["suppression"] == False - - # Both should be flagged as diff_risk - assert summary_df.iloc[0]["diff_risk"] == True - assert summary_df.iloc[1]["diff_risk"] == True + assert isinstance(table, pd.DataFrame) -def test_summary_regression_metadata(data, acro): - """Test that regression outputs are correctly captured in the summary.""" - new_df = data[["inc_activity", "inc_grants", "inc_donations", "total_costs"]] - new_df = new_df.dropna() - endog = new_df.inc_activity - exog = new_df[["inc_grants", "inc_donations", "total_costs"]] - exog = add_constant(exog) - _ = acro.ols(endog, exog) - summary_df = acro.results.generate_summary() - assert len(summary_df) == 1 - row = summary_df.iloc[0] - assert row["method"] == "ols" - assert row["type"] == "regression" - assert row["total_records"] > 0 - variables = row["variables"] - assert "inc_activity" in variables - assert "inc_grants" in variables - assert "inc_donations" in variables - assert "total_costs" in variables - - -def test_summary_empty_session(): - """Test that summary handles an empty session gracefully.""" - acro_obj = ACRO(suppress=False) - summary_df = acro_obj.results.generate_summary() - assert summary_df.empty - assert "diff_risk" in summary_df.columns - - -def test_extract_table_info(data, acro): - """Test the _extract_table_info helper method.""" - acro.crosstab(data.year, data.grant_type) - output = acro.results.get_index(0) - - # Test extracting info from table output - variables, total_records = acro.results._extract_table_info(output.output) - assert len(variables) > 0 - assert total_records > 0 - assert "year" in variables or "grant_type" in variables - - -def test_extract_regression_info(): - """Test the _extract_regression_info helper method with sample data.""" - # Create a simple regression output for testing - records = Records() - - # Create sample regression output - reg_output = pd.DataFrame( +def test_pivot_table_std_dropna(): + """Test pivot_table with std and dropna=True.""" + data = pd.DataFrame( { - "coef": [1.0, 2.0], - "std err": [0.1, 0.2], - "t": [10.0, 10.0], - "P>|t|": [0.0, 0.0], - }, - index=["const", "x1"], - ) - - # Create a second table with "no. observations" - obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"]) - - output = [reg_output, obs_output] - total_records = records._extract_regression_info(output) - - # Should have extracted the observations - assert total_records == 100 - - -def test_mark_diff_risk(data, acro): - """Test the _mark_diff_risk helper method.""" - # Create two crosstabs with same variables but different data - acro.crosstab(data.year, data.grant_type) - acro.crosstab(data.year, data.grant_type) - - summary_df = acro.results.generate_summary() - - # Check that diff_risk column exists - assert "diff_risk" in summary_df.columns - # Both outputs should have diff_risk marked - assert all(isinstance(x, (bool, type(pd.NA))) for x in summary_df["diff_risk"]) - - -def test_extract_table_info_with_zero_cell_sum(): - """Test _extract_table_info when cell_sum equals 0.""" - records = Records() - - # Create a table with all NaN values to trigger cell_sum = 0 path - table = pd.DataFrame( - [[np.nan, np.nan], [np.nan, np.nan]], - index=["row1", "row2"], - columns=["col1", "col2"], + "A": ["x", "x", "y", "z"], + "B": ["c1", "c1", "c2", "c2"], + "V": [1, 2, 3, 4], + } ) - table.index.name = "idx" - table.columns.name = "cols" - - output = [table] - variables, total_records = records._extract_table_info(output) - - # Should extract variables and use shape-based count when cell_sum is 0 - assert "idx" in variables - assert "cols" in variables - assert total_records > 0 - - -def test_extract_regression_info_with_missing_observations(): - """Test _extract_regression_info when observation value is NaN.""" - records = Records() - - # Create regression output with no. observations but NaN value - obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"]) - - output = [obs_output] - total_records = records._extract_regression_info(output) - - # Should handle NaN gracefully - assert total_records == 0 - - -def test_extract_regression_info_with_invalid_value(): - """Test _extract_regression_info when observation value is non-numeric.""" - records = Records() - - # Create regression output with invalid observation value - obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"]) + acro = ACRO() + table = acro.pivot_table(data, values="V", index="A", columns="B", aggfunc="std") + assert isinstance(table, pd.DataFrame) + assert "y" not in table.index + assert "z" not in table.index + assert "c2" not in table.columns - output = [obs_output] - total_records = records._extract_regression_info(output) - # Should handle ValueError gracefully - assert total_records == 0 - - -def test_extract_table_info_with_numeric_data(): - """Test _extract_table_info with numeric data.""" - records = Records() - - # Create a table with numeric values (not NaN) - table = pd.DataFrame( - [[10, 20], [30, 40]], - index=["row1", "row2"], - columns=["col1", "col2"], +def test_crosstab_multi_aggfunc(data): + """Test acro crosstab with multi-aggfunc list e.g. ['mean', 'std'].""" + acro = ACRO(suppress=False) + table = acro.crosstab( + data["survivor"], + data["grant_type"], + values=data["inc_grants"], + aggfunc=["mean", "std"], + margins=False, + ) + assert isinstance(table, pd.DataFrame) + assert table.columns.nlevels == 2 + top_levels = table.columns.get_level_values(0).unique().tolist() + assert "mean" in top_levels + assert "std" in top_levels + + acro2 = ACRO(suppress=True) + table2 = acro2.crosstab( + data["survivor"], + data["grant_type"], + values=data["inc_grants"], + aggfunc=["mean", "std"], + margins=True, ) - table.index.name = "idx" - table.columns.name = "cols" - - output = [table] - # Test with crosstab method to trigger the cell_sum > 0 path - variables, total_records = records._extract_table_info(output) - - # Should extract variables - assert "idx" in variables - assert "cols" in variables - # With numeric data, cell_sum will be 100 (10+20+30+40), so total_records should be 100 - assert total_records == 100 - + assert isinstance(table2, pd.DataFrame) + assert table2.columns.nlevels == 2 -def test_extract_table_info_with_mixed_data(): - """Test _extract_table_info with mixed NaN and numeric data.""" - records = Records() - # Create a table with some NaN and some numeric values +def test_align_masks_droplevel(): + """Test align_masks drops extra index/column levels to match table shape.""" + # table with single-level columns and index table = pd.DataFrame( - [[10, np.nan], [np.nan, 40]], - index=["row1", "row2"], - columns=["col1", "col2"], - ) - table.index.name = "idx" - - output = [table] - variables, total_records = records._extract_table_info(output) - - # Should extract variables - assert "idx" in variables - # cell_sum = 10 + 40 = 50, so total_records should be 50 - assert total_records == 50 - - -def test_generate_variable_matrix_table(): - """Test the variable matrix table. - - Should have one row per output and one column per variable, - with binary values indicating variable presence. + {"c1": [1.0, 2.0], "c2": [3.0, 4.0]}, + index=pd.Index(["r1", "r2"]), + ) + # mask with MultiIndex columns (extra level) + multi_cols = pd.MultiIndex.from_tuples([("agg", "c1"), ("agg", "c2")]) + mask_multi_col = pd.DataFrame( + [[False, False], [False, True]], + index=pd.Index(["r1", "r2"]), + columns=multi_cols, + ) + # mask with MultiIndex index (extra level) + multi_idx = pd.MultiIndex.from_tuples([("g1", "r1"), ("g1", "r2")]) + mask_multi_idx = pd.DataFrame( + [[False, False], [False, True]], + index=multi_idx, + columns=pd.Index(["c1", "c2"]), + ) + masks = {"multi_col": mask_multi_col, "multi_idx": mask_multi_idx} + aligned = acro_tables.align_masks(table, masks) + # both masks should now match table shape exactly + assert aligned["multi_col"].columns.tolist() == ["c1", "c2"] + assert aligned["multi_col"].index.tolist() == ["r1", "r2"] + assert aligned["multi_idx"].index.tolist() == ["r1", "r2"] + assert aligned["multi_idx"].columns.tolist() == ["c1", "c2"] + + +def test_cell_id_alignment_with_margins_and_suppression(data): + """Verify cell IDs in results.json are valid table indices. + + The key issue in bug was that when pandas removes empty rows/columns, + cell positions stored in the mask become invalid. """ - acro_obj = ACRO(suppress=False) - - data = pd.DataFrame( - { - "year": [2010] * 10 + [2011] * 10, - "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5, - "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5, - } + acro = ACRO(suppress=True) + table = acro.crosstab( + data.year, data.grant_type, margins=True, show_suppressed=True ) + output = acro.results.get_index(0) - # Create three outputs with different variable combinations - _ = acro_obj.crosstab(data.year, data.grant_type) - _ = acro_obj.crosstab(data.year, data.region) - _ = acro_obj.crosstab(data.grant_type, data.region) - - var_matrix = acro_obj.results.generate_variable_matrix_table() - - assert "output_id" in var_matrix.columns - assert "output_type" in var_matrix.columns - assert len(var_matrix) == 3 - - assert "year" in var_matrix.columns - assert "grant_type" in var_matrix.columns - assert "region" in var_matrix.columns - - for row in var_matrix.itertuples(): - assert row.year in [0, 1] - assert row.grant_type in [0, 1] - assert row.region in [0, 1] - - assert var_matrix.iloc[0]["year"] == 1 - assert var_matrix.iloc[0]["grant_type"] == 1 - assert var_matrix.iloc[0]["region"] == 0 - - assert var_matrix.iloc[1]["year"] == 1 - assert var_matrix.iloc[1]["grant_type"] == 0 - assert var_matrix.iloc[1]["region"] == 1 - - assert var_matrix.iloc[2]["year"] == 0 - assert var_matrix.iloc[2]["grant_type"] == 1 - assert var_matrix.iloc[2]["region"] == 1 - - -def test_get_endog_exog_variables(): - """Test extracting variables from endog and exog arguments.""" - endog = pd.Series([1, 2, 3], name="y") - exog = pd.DataFrame({"const": [1, 1, 1], "x1": [4, 5, 6], "x2": [7, 8, 9]}) - - variables = _get_endog_exog_variables(endog, exog) - assert variables == ["y", "x1", "x2"] - - exog_series = pd.Series([4, 5, 6], name="x1") - variables = _get_endog_exog_variables(endog, exog_series) - assert variables == ["y", "x1"] - - endog_noname = pd.Series([1, 2, 3]) - exog_noname = pd.Series([4, 5, 6]) - variables = _get_endog_exog_variables(endog_noname, exog_noname) - assert variables == [] - - -def test_get_endog_exog_variables_edge_cases(): - """Additional edge cases for endog/exog variable extraction.""" - endog = pd.Series([1, 2, 3], name="y") - exog = pd.DataFrame({"const": [1, 1, 1]}) - - variables = _get_endog_exog_variables(endog, exog) - assert variables == ["y"] - - exog = pd.DataFrame({"x1": [4, 5, 6], "x2": [7, 8, 9]}) - variables = _get_endog_exog_variables(endog, exog) - assert variables == ["y", "x1", "x2"] - - exog_noname = pd.Series([4, 5, 6]) - variables = _get_endog_exog_variables(endog, exog_noname) - assert variables == ["y"] - exog_empty = pd.DataFrame() - variables = _get_endog_exog_variables(endog, exog_empty) - assert variables == ["y"] - - -def test_get_formula_variables(): - """Test extracting variables from R-style formula string.""" - formula = "y ~ x1 + x2" - assert _get_formula_variables(formula) == ["y", "x1", "x2"] - - formula_complex = "y ~ x1 + x2:x3 + x4*x5 + I(x6^2) + C(x7)" - expected = ["y", "x1", "x2", "x3", "x4", "x5", "x6", "x7"] - assert _get_formula_variables(formula_complex) == expected - assert _get_formula_variables("y") == [] - assert _get_formula_variables("~ x1") == ["x1"] - - -def test_get_formula_variables_edge_cases(): - """Edge cases for formula parsing.""" - formula = "y ~ x1 + x1 + x2" - assert _get_formula_variables(formula) == ["y", "x1", "x2"] - - formula = "y ~ 1 + x1" - assert _get_formula_variables(formula) == ["y", "x1"] - - formula = " y ~ x1 + x2 " - assert _get_formula_variables(formula) == ["y", "x1", "x2"] - - formula = "y ~ x1:x2" - assert _get_formula_variables(formula) == ["y", "x1", "x2"] - - formula = "y ~ x1*x2" - assert _get_formula_variables(formula) == ["y", "x1", "x2"] - - formula = "y ~ I(x1^2)" - assert _get_formula_variables(formula) == ["y", "x1"] - - formula = "y ~ C(x1)" - assert _get_formula_variables(formula) == ["y", "x1"] - - formula = "y ~ I((x1 + x2)^2)" - result = _get_formula_variables(formula) - - assert "y" in result - assert "x1" in result - assert "x2" in result - - formula = "y ~ " - assert _get_formula_variables(formula) == ["y"] - - formula = "~ x1 + x2" - assert _get_formula_variables(formula) == ["x1", "x2"] + for cell_type in output.sdc["cells"]: + cells = output.sdc["cells"][cell_type] + for row, col in cells: + assert row < table.shape[0], ( + f"Row {row} out of bounds for {cell_type} cell in table shape {table.shape}" + ) + assert col < table.shape[1], ( + f"Column {col} out of bounds for {cell_type} cell in table shape {table.shape}" + ) + + value = table.iloc[row, col] + if cell_type in ["threshold", "p-ratio", "nk-rule"]: + assert pd.isna(value), ( + f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}" + ) + + # margins=False + acro2 = ACRO(suppress=True) + table2 = acro2.crosstab(data.year, data.grant_type, margins=False) + output2 = acro2.results.get_index(0) + + for cell_type in output2.sdc["cells"]: + cells = output2.sdc["cells"][cell_type] + for row, col in cells: + assert row < table2.shape[0], ( + f"Row {row} out of bounds for {cell_type} cell in table shape {table2.shape}" + ) + assert col < table2.shape[1], ( + f"Column {col} out of bounds for {cell_type} cell in table shape {table2.shape}" + ) + + value = table2.iloc[row, col] + if cell_type in ["threshold", "p-ratio", "nk-rule"]: + assert pd.isna(value), ( + f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}" + )