From 86b2eb2517319a6c153c0ba48ed49d94683b3b28 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 15:24:02 +0000
Subject: [PATCH 01/29] feat: add summary generation and export functionality
 for results

- Add _extract_table_info() helper method to extract variables and record counts from table outputs
- Add _extract_regression_info() helper method to extract observation counts from regression outputs
- Add _mark_diff_risk() helper method to identify outputs with differencing risk
- Add write_summary() method to export summary to CSV
- Enhance finalise_excel() to include summary sheet as first sheet
- Add comprehensive test coverage for all helper methods and edge cases
---
 acro/record.py       | 177 +++++++++++++++++++++++++++++++++
 test/test_initial.py | 231 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 408 insertions(+)

diff --git a/acro/record.py b/acro/record.py
index aa29a5de..94e00ae4 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -430,6 +430,174 @@ def validate_outputs(self) -> None:
                 )
                 record.exception = input("")
 
+    def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]:
+        """Extract variables and total records from table output.
+
+        Parameters
+        ----------
+        output : list
+            The output to extract information from.
+        method : str
+            The method used to generate the output.
+
+        Returns
+        -------
+        tuple[list[str], int]
+            Variables and total record count.
+        """
+        variables: list[str] = []
+        total_records: int = 0
+
+        for table in output:
+            if isinstance(table, DataFrame):
+                # Extract index names
+                if hasattr(table.index, "names") and any(table.index.names):
+                    variables.extend(str(n) for n in table.index.names if n is not None)
+                elif table.index.name:
+                    variables.append(str(table.index.name))
+
+                # Extract column names
+                if hasattr(table.columns, "names") and any(table.columns.names):
+                    variables.extend(
+                        str(n) for n in table.columns.names if n is not None
+                    )
+                elif table.columns.name:
+                    variables.append(str(table.columns.name))
+
+                # Calculate total records
+                try:
+                    total_records += int(table.shape[0] * table.shape[1])
+                    if method == "crosstab":
+                        cell_sum = table.values[~pd.isna(table.values)].sum()
+                        if cell_sum > 0:
+                            total_records = int(cell_sum)
+                except (TypeError, ValueError):
+                    pass
+
+        return variables, total_records
+
+    def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
+        """Extract variables and total records from regression output.
+
+        Parameters
+        ----------
+        output : list
+            The output to extract information from.
+
+        Returns
+        -------
+        tuple[list[str], int]
+            Variables and total record count.
+        """
+        variables: list[str] = []
+        total_records: int = 0
+
+        for table in output:
+            if isinstance(table, DataFrame):
+                for idx in table.index:
+                    idx_str = str(idx)
+                    if "no. observations" in idx_str.lower():
+                        try:
+                            val = table.loc[idx].dropna().iloc[0]
+                            total_records = int(float(val))
+                        except (ValueError, TypeError, IndexError):
+                            pass
+                        break
+
+        return variables, total_records
+
+    def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
+        """Mark outputs with differencing risk.
+
+        Parameters
+        ----------
+        summary_df : DataFrame
+            The summary DataFrame to update.
+
+        Returns
+        -------
+        DataFrame
+            Updated summary DataFrame with diff_risk column.
+        """
+        if summary_df.empty:
+            summary_df["diff_risk"] = pd.Series(dtype=bool)
+        else:
+            summary_df["diff_risk"] = False
+            table_mask = summary_df["type"] == "table"
+            table_outputs = summary_df.loc[table_mask]
+            if not table_outputs.empty:
+                for _, group in table_outputs.groupby("variables"):
+                    if len(group) > 1:
+                        counts = group["total_records"].unique()
+                        if len(counts) > 1:
+                            summary_df.loc[group.index, "diff_risk"] = True
+
+        return summary_df
+
+    def generate_summary(self) -> DataFrame:
+        """Generate a summary DataFrame of all outputs in the session.
+
+        Provides output checkers with a high-level overview of all outputs,
+        including what method was used, what variables are involved, the
+        total record count, and whether there is a differencing risk.
+
+        Returns
+        -------
+        DataFrame
+            Summary of all outputs with columns: id, method, status, type,
+            command, summary, variables, total_records, timestamp, diff_risk.
+        """
+        rows = []
+        for uid, rec in self.results.items():
+            method = rec.properties.get("method", rec.output_type)
+            variables: list[str] = []
+            total_records: int = 0
+
+            if rec.output_type == "table":
+                variables, total_records = self._extract_table_info(rec.output, method)
+            elif rec.output_type == "regression":
+                variables, total_records = self._extract_regression_info(rec.output)
+                dof = rec.properties.get("dof")
+                if dof is not None:
+                    variables.append(f"dof={dof}")
+
+            variables_str = "; ".join(variables) if variables else ""
+
+            rows.append(
+                {
+                    "id": uid,
+                    "method": method,
+                    "status": rec.status,
+                    "type": rec.output_type,
+                    "command": rec.command,
+                    "summary": rec.summary,
+                    "variables": variables_str,
+                    "total_records": total_records,
+                    "timestamp": rec.timestamp,
+                }
+            )
+
+        summary_df = DataFrame(rows)
+        summary_df = self._mark_diff_risk(summary_df)
+
+        return summary_df
+
+    def write_summary(self, path: str) -> None:
+        """Write the session summary to a CSV file.
+
+        Parameters
+        ----------
+        path : str
+            Name of the output folder.
+        """
+        summary_df = self.generate_summary()
+        if summary_df.empty:
+            return
+        os.makedirs(path, exist_ok=True)
+        filename = os.path.normpath(f"{path}/summary.csv")
+        summary_df.to_csv(filename, index=False)
+        logger.info("summary written to: %s", filename)
+
     def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
         """Create a results file for checking.
 
@@ -451,6 +619,7 @@ def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
             self.finalise_excel(path)
         else:
             raise ValueError("Invalid file extension. Options: {json, xlsx}")
+        self.write_summary(path)
         self.write_checksums(path)
         # check if the directory acro_artifacts exists and delete it
         if os.path.exists("acro_artifacts"):
@@ -513,6 +682,13 @@ def finalise_excel(self, path: str) -> None:
         with pd.ExcelWriter(  # pylint: disable=abstract-class-instantiated
             filename, engine="openpyxl"
         ) as writer:
+            # summary sheet
+            summary_df = self.generate_summary()
+            if not summary_df.empty:
+                summary_df.to_excel(
+                    writer, sheet_name="summary", index=False, startrow=0
+                )
+
             # description sheet
             sheet = []
             summary = []
@@ -527,6 +703,7 @@ def finalise_excel(self, path: str) -> None:
                 {"Sheet": sheet, "Command": command, "Summary": summary}
             )
             tmp_df.to_excel(writer, sheet_name="description", index=False, startrow=0)
+
             # individual sheets
             for output_id, output in self.results.items():
                 if output.output_type == "custom":
diff --git a/test/test_initial.py b/test/test_initial.py
index 42e50c7b..591c8fde 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1250,3 +1250,234 @@ def test_toggle_suppression():
     assert acro.suppress
     acro.disable_suppression()
     assert not acro.suppress
+
+
+def test_summary_csv_created_with_json(data, acro):
+    """Test that summary.csv is created when finalising to JSON (Issue #224)."""
+    _ = acro.crosstab(data.year, data.grant_type)
+    acro.add_exception("output_0", "Let me have it")
+    acro.finalise(PATH, "json")
+    summary_path = os.path.normpath(f"{PATH}/summary.csv")
+    assert os.path.exists(summary_path), "summary.csv was not created"
+    summary_df = pd.read_csv(summary_path)
+    assert len(summary_df) == 1
+    assert "id" in summary_df.columns
+    assert "method" in summary_df.columns
+    assert "variables" in summary_df.columns
+    assert "total_records" in summary_df.columns
+    assert "diff_risk" in summary_df.columns
+    # check values
+    assert summary_df.iloc[0]["id"] == "output_0"
+    assert summary_df.iloc[0]["method"] == "crosstab"
+    assert summary_df.iloc[0]["total_records"] > 0
+    shutil.rmtree(PATH)
+
+
+def test_summary_sheet_in_excel(data, acro):
+    """Test that a 'summary' sheet is created in results.xlsx (Issue #224)."""
+    _ = acro.crosstab(data.year, data.grant_type)
+    _ = acro.pivot_table(
+        data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"]
+    )
+    acro.add_exception("output_0", "Let me have it")
+    acro.add_exception("output_1", "I need this")
+    acro.finalise(PATH, "xlsx")
+    filename = os.path.normpath(f"{PATH}/results.xlsx")
+    xl = pd.ExcelFile(filename)
+    assert "summary" in xl.sheet_names, "'summary' sheet not found in Excel"
+    summary_df = pd.read_excel(filename, sheet_name="summary")
+    xl.close()
+    assert len(summary_df) == 2
+    methods = summary_df["method"].tolist()
+    assert "crosstab" in methods
+    assert "pivot_table" in methods
+    shutil.rmtree(PATH)
+
+
+def test_summary_variables_extracted(data):
+    """Test that variable names are correctly extracted into the summary."""
+    acro_obj = ACRO(suppress=False)
+    _ = acro_obj.crosstab(data.year, data.grant_type)
+    acro_obj.add_exception("output_0", "Let me have it")
+    acro_obj.finalise(PATH, "json")
+    summary_path = os.path.normpath(f"{PATH}/summary.csv")
+    summary_df = pd.read_csv(summary_path)
+    variables = summary_df.iloc[0]["variables"]
+    assert "year" in variables
+    assert "grant_type" in variables
+    shutil.rmtree(PATH)
+
+
+def test_summary_differencing_risk(data):
+    """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224)."""
+    acro_obj = ACRO(suppress=True)
+    _ = acro_obj.crosstab(data.year, data.grant_type)
+    data_subset = data.iloc[10:].copy()
+    _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type)
+    acro_obj.add_exception("output_0", "Let me have it")
+    acro_obj.add_exception("output_1", "I need this")
+
+    summary_df = acro_obj.results.generate_summary()
+
+    assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"]
+
+    assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"]
+
+    assert summary_df.iloc[0]["diff_risk"] == True
+    assert summary_df.iloc[1]["diff_risk"] == True
+
+
+def test_summary_regression_metadata(data, acro):
+    """Test that regression outputs are correctly captured in the summary."""
+    new_df = data[["inc_activity", "inc_grants", "inc_donations", "total_costs"]]
+    new_df = new_df.dropna()
+    endog = new_df.inc_activity
+    exog = new_df[["inc_grants", "inc_donations", "total_costs"]]
+    exog = add_constant(exog)
+    _ = acro.ols(endog, exog)
+    summary_df = acro.results.generate_summary()
+    assert len(summary_df) == 1
+    row = summary_df.iloc[0]
+    assert row["method"] == "ols"
+    assert row["type"] == "regression"
+    assert row["total_records"] > 0
+    assert "dof=" in row["variables"]
+
+
+def test_summary_empty_session():
+    """Test that summary handles an empty session gracefully."""
+    acro_obj = ACRO(suppress=False)
+    summary_df = acro_obj.results.generate_summary()
+    assert summary_df.empty
+    assert "diff_risk" in summary_df.columns
+
+
+def test_extract_table_info(data, acro):
+    """Test the _extract_table_info helper method."""
+    acro.crosstab(data.year, data.grant_type)
+    output = acro.results.get_index(0)
+
+    # Test extracting info from table output with crosstab method
+    variables, total_records = acro.results._extract_table_info(
+        output.output, "crosstab"
+    )
+    assert len(variables) > 0
+    assert total_records > 0
+    assert "year" in variables or "grant_type" in variables
+
+    # Also test with non-crosstab method to ensure both paths work
+    variables2, total_records2 = acro.results._extract_table_info(
+        output.output, "pivot_table"
+    )
+    assert len(variables2) > 0
+    assert total_records2 > 0
+
+
+def test_extract_regression_info():
+    """Test the _extract_regression_info helper method with sample data."""
+    # Create a simple regression output for testing
+    records = Records()
+
+    # Create sample regression output
+    reg_output = pd.DataFrame(
+        {
+            "coef": [1.0, 2.0],
+            "std err": [0.1, 0.2],
+            "t": [10.0, 10.0],
+            "P>|t|": [0.0, 0.0],
+        },
+        index=["const", "x1"],
+    )
+
+    # Create a second table with "no. observations"
+    obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"])
+
+    output = [reg_output, obs_output]
+    variables, total_records = records._extract_regression_info(output)
+
+    # Should have extracted the observations
+    assert isinstance(variables, list)
+    assert total_records == 100
+
+
+def test_mark_diff_risk(data, acro):
+    """Test the _mark_diff_risk helper method."""
+    # Create two crosstabs with same variables but different data
+    acro.crosstab(data.year, data.grant_type)
+    acro.crosstab(data.year, data.grant_type)
+
+    summary_df = acro.results.generate_summary()
+
+    # Check that diff_risk column exists
+    assert "diff_risk" in summary_df.columns
+    # Both outputs should have diff_risk marked
+    assert all(isinstance(x, (bool, type(pd.NA))) for x in summary_df["diff_risk"])
+
+
+def test_extract_table_info_with_zero_cell_sum():
+    """Test _extract_table_info when cell_sum equals 0."""
+    records = Records()
+
+    # Create a table with all NaN values to trigger cell_sum = 0 path
+    table = pd.DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan]],
+        index=["row1", "row2"],
+        columns=["col1", "col2"],
+    )
+    table.index.name = "idx"
+    table.columns.name = "cols"
+
+    output = [table]
+    variables, total_records = records._extract_table_info(output, "crosstab")
+
+    # Should extract variables and use shape-based count when cell_sum is 0
+    assert "idx" in variables
+    assert "cols" in variables
+    assert total_records > 0
+
+
+def test_extract_table_info_exception_handling():
+    """Test _extract_table_info exception handling with invalid table."""
+    records = Records()
+
+    # Create a table that will cause TypeError during shape multiplication
+    class InvalidTable:
+        def __init__(self):
+            self.shape = "invalid"  # Non-integer shape to trigger TypeError
+
+    output = [InvalidTable()]
+    variables, total_records = records._extract_table_info(output, "crosstab")
+
+    # Should handle exception gracefully
+    assert isinstance(variables, list)
+    assert isinstance(total_records, int)
+
+
+def test_extract_regression_info_with_missing_observations():
+    """Test _extract_regression_info when observation value is NaN."""
+    records = Records()
+
+    # Create regression output with no. observations but NaN value
+    obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"])
+
+    output = [obs_output]
+    variables, total_records = records._extract_regression_info(output)
+
+    # Should handle NaN gracefully
+    assert isinstance(variables, list)
+    assert total_records == 0
+
+
+def test_extract_regression_info_with_invalid_value():
+    """Test _extract_regression_info when observation value is non-numeric."""
+    records = Records()
+
+    # Create regression output with invalid observation value
+    obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"])
+
+    output = [obs_output]
+    variables, total_records = records._extract_regression_info(output)
+
+    # Should handle ValueError gracefully
+    assert isinstance(variables, list)
+    assert total_records == 0

From ea0a63ed6f33e7500526576673fbd2a6fdbb760d Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 15:38:21 +0000
Subject: [PATCH 02/29] test: add coverage tests for table info extraction

---
 test/test_initial.py | 64 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/test/test_initial.py b/test/test_initial.py
index 591c8fde..da36993a 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1481,3 +1481,67 @@ def test_extract_regression_info_with_invalid_value():
     # Should handle ValueError gracefully
     assert isinstance(variables, list)
     assert total_records == 0
+
+
+def test_extract_table_info_with_numeric_data():
+    """Test _extract_table_info with numeric data to cover line 456 and 465."""
+    records = Records()
+
+    # Create a table with numeric values (not NaN)
+    table = pd.DataFrame(
+        [[10, 20], [30, 40]],
+        index=["row1", "row2"],
+        columns=["col1", "col2"],
+    )
+    table.index.name = "idx"
+    table.columns.name = "cols"
+
+    output = [table]
+    # Test with crosstab method to trigger the cell_sum > 0 path
+    variables, total_records = records._extract_table_info(output, "crosstab")
+
+    # Should extract variables
+    assert "idx" in variables
+    assert "cols" in variables
+    # With numeric data, cell_sum will be 100 (10+20+30+40), so total_records should be 100
+    assert total_records == 100
+
+
+def test_extract_table_info_with_mixed_data():
+    """Test _extract_table_info with mixed NaN and numeric data."""
+    records = Records()
+
+    # Create a table with some NaN and some numeric values
+    table = pd.DataFrame(
+        [[10, np.nan], [np.nan, 40]],
+        index=["row1", "row2"],
+        columns=["col1", "col2"],
+    )
+    table.index.name = "idx"
+
+    output = [table]
+    variables, total_records = records._extract_table_info(output, "crosstab")
+
+    # Should extract variables
+    assert "idx" in variables
+    # cell_sum = 10 + 40 = 50, so total_records should be 50
+    assert total_records == 50
+
+
+def test_generate_summary_with_crosstab(data, acro):
+    """Test generate_summary triggers _extract_table_info coverage for lines 456-465."""
+    # Create actual crosstab output to trigger generate_summary path
+    _ = acro.crosstab(data.year, data.grant_type)
+
+    # Call generate_summary which will call _extract_table_info
+    summary_df = acro.results.generate_summary()
+
+    # Verify summary was generated
+    assert len(summary_df) > 0
+    assert "variables" in summary_df.columns
+    assert "total_records" in summary_df.columns
+    # Should have extracted year and grant_type as variables
+    assert (
+        "year" in summary_df.iloc[0]["variables"]
+        or "grant_type" in summary_df.iloc[0]["variables"]
+    )

From f3c6d96970ec2da6f3dbccd7b20a47b5cf0da3ff Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 16:09:26 +0000
Subject: [PATCH 03/29] fix coverage

---
 test/test_initial.py | 85 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 4 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index da36993a..a7056e1e 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -555,10 +555,14 @@ def test_add_to_acro(data, monkeypatch):
     src_path = "test_add_to_acro"
     dest_path = "sdc_results"
     file_path = "crosstab.pkl"
-    if not os.path.exists(src_path):  # pragma no cover
-        table.to_pickle(file_path)
-        os.mkdir(src_path)
-        shutil.move(file_path, src_path, copy_function=shutil.copytree)
+    # ensure a clean state before setup
+    if os.path.exists(src_path):
+        shutil.rmtree(src_path)
+    if os.path.exists(dest_path):
+        shutil.rmtree(dest_path)
+    table.to_pickle(file_path)
+    os.mkdir(src_path)
+    shutil.move(file_path, src_path, copy_function=shutil.copytree)
     # add exception to the output
     exception = ["I want it"]
     monkeypatch.setattr("builtins.input", lambda _: exception.pop(0))
@@ -566,6 +570,8 @@ def test_add_to_acro(data, monkeypatch):
     add_to_acro(src_path, dest_path)
     assert "results.json" in os.listdir(dest_path)
     assert "crosstab.pkl" in os.listdir(dest_path)
+    shutil.rmtree(src_path)
+    shutil.rmtree(dest_path)
 
 
 def test_prettify_tablestring(data):
@@ -1545,3 +1551,74 @@ def test_generate_summary_with_crosstab(data, acro):
         "year" in summary_df.iloc[0]["variables"]
         or "grant_type" in summary_df.iloc[0]["variables"]
     )
+
+
+def test_extract_table_info_elif_index_name():
+    """Cover lines 456-457: elif table.index.name branch.
+
+    Triggered when index.names has no truthy entries (any() is False)
+    but index.name is set.
+    """
+    records = Records()
+
+    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
+    table.index.name = "myindex"
+
+    # Subclass the index type to override `names` so any(names) is False,
+    # while `name` still returns "myindex" (reads from _names[0]).
+    idx_type = type(table.index)
+    patched_idx = type("PatchedIdx", (idx_type,), {"names": property(lambda _: [None])})
+    table.index.__class__ = patched_idx
+
+    output = [table]
+    variables, total_records = records._extract_table_info(output, "linear")
+
+    assert "myindex" in variables
+
+
+def test_extract_table_info_elif_columns_name():
+    """Cover line 465: elif table.columns.name branch.
+
+    Triggered when columns.names has no truthy entries (any() is False)
+    but columns.name is set.
+    """
+    records = Records()
+
+    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
+    table.columns.name = "mycols"
+
+    # Override index.names to avoid the `if` branch for index (no index name set).
+    # Override columns.names so any() is False, forcing the elif for columns.
+    col_type = type(table.columns)
+    patched_idx = type("PatchedIdx", (col_type,), {"names": property(lambda _: [None])})
+    table.columns.__class__ = patched_idx
+
+    output = [table]
+    variables, total_records = records._extract_table_info(output, "linear")
+
+    assert "mycols" in variables
+
+
+def test_extract_table_info_shape_type_error():
+    """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
+
+    Triggered when shape multiplication raises TypeError (e.g. shape returns
+    (None, None) so None * None raises TypeError).
+    """
+    records = Records()
+
+    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
+    table.index.name = "idx"
+
+    class BrokenShapeDF(pd.DataFrame):
+        @property
+        def shape(self):
+            return (None, None)
+
+    table.__class__ = BrokenShapeDF
+
+    output = [table]
+    variables, total_records = records._extract_table_info(output, "linear")
+
+    assert isinstance(variables, list)
+    assert total_records == 0

From f9ef74ec853564f062ce69cee8c78c22dd2ca912 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 16:21:32 +0000
Subject: [PATCH 04/29] test: add coverage for empty summary edge case (line
 595)

---
 test/test_initial.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/test_initial.py b/test/test_initial.py
index a7056e1e..f04a07fc 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1622,3 +1622,17 @@ def shape(self):
 
     assert isinstance(variables, list)
     assert total_records == 0
+
+
+def test_write_summary_empty_session(tmp_path):
+    """Test write_summary when there are no outputs (line 595 coverage)."""
+    records = Records()
+
+    # Create a temp directory for output
+    output_path = str(tmp_path / "empty_summary")
+
+    # Call write_summary with no outputs - should return early without creating file
+    records.write_summary(output_path)
+
+    # Verify no file was created since summary was empty
+    assert not os.path.exists(os.path.normpath(f"{output_path}/summary.csv"))

From 47359b2fde398fb96a03d6ed28feb844327150af Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 16:24:22 +0000
Subject: [PATCH 05/29] test: remove unused variable idx_type

---
 test/test_initial.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index f04a07fc..98ce074f 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1566,8 +1566,11 @@ def test_extract_table_info_elif_index_name():
 
     # Subclass the index type to override `names` so any(names) is False,
     # while `name` still returns "myindex" (reads from _names[0]).
-    idx_type = type(table.index)
-    patched_idx = type("PatchedIdx", (idx_type,), {"names": property(lambda _: [None])})
+    patched_idx = type(
+        "PatchedIdx",
+        (type(table.index),),
+        {"names": property(lambda _: [None])},
+    )
     table.index.__class__ = patched_idx
 
     output = [table]

From f817df23c882bc09e606d9be8df6fb96258ffe94 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 19 Feb 2026 16:28:59 +0000
Subject: [PATCH 06/29] test: fix unused variable warnings by using underscore

---
 test/test_initial.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index 98ce074f..cb27c229 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1574,7 +1574,7 @@ def test_extract_table_info_elif_index_name():
     table.index.__class__ = patched_idx
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "linear")
+    variables, _ = records._extract_table_info(output, "linear")
 
     assert "myindex" in variables
 
@@ -1597,7 +1597,7 @@ def test_extract_table_info_elif_columns_name():
     table.columns.__class__ = patched_idx
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "linear")
+    variables, _ = records._extract_table_info(output, "linear")
 
     assert "mycols" in variables
 

From e8bc9267c4374452a66f8a65c3ece04e05a744d0 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 5 Mar 2026 09:38:56 +0000
Subject: [PATCH 07/29] feat: implement session summary with differencing risk
 detection

- Add generate_summary() to provide high-level output overview for checkers

- Add multi-layered 'DO NOT RELEASE' warnings (filename, comment)
- Integrate session summary into JSON and Excel outputs

Resolves #224
---
 acro/record.py       | 49 +++++++++++++++++++++++++++++---------------
 test/test_initial.py | 24 +++++++++++-----------
 2 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 94e00ae4..dea7238a 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -450,13 +450,11 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
 
         for table in output:
             if isinstance(table, DataFrame):
-                # Extract index names
                 if hasattr(table.index, "names") and any(table.index.names):
                     variables.extend(str(n) for n in table.index.names if n is not None)
                 elif table.index.name:
                     variables.append(str(table.index.name))
 
-                # Extract column names
                 if hasattr(table.columns, "names") and any(table.columns.names):
                     variables.extend(
                         str(n) for n in table.columns.names if n is not None
@@ -464,7 +462,6 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
                 elif table.columns.name:
                     variables.append(str(table.columns.name))
 
-                # Calculate total records
                 try:
                     total_records += int(table.shape[0] * table.shape[1])
                     if method == "crosstab":
@@ -549,6 +546,8 @@ def generate_summary(self) -> DataFrame:
         """
         rows = []
         for uid, rec in self.results.items():
+            if rec.output_type == "custom":
+                continue
             method = rec.properties.get("method", rec.output_type)
             variables: list[str] = []
             total_records: int = 0
@@ -582,21 +581,28 @@ def generate_summary(self) -> DataFrame:
 
         return summary_df
 
-    def write_summary(self, path: str) -> None:
-        """Write the session summary to a CSV file.
+    def add_summary_to_results(self) -> None:
+        """Add the summary DataFrame as a custom output to results.
 
-        Parameters
-        ----------
-        path : str
-            Name of the output folder.
+        This generates a summary of all outputs in the session with metadata
+        about variables, record counts, and differencing risk. The file is
+        marked with a clear warning not to release.
         """
         summary_df = self.generate_summary()
         if summary_df.empty:
             return
-        os.makedirs(path, exist_ok=True)
-        filename = os.path.normpath(f"{path}/summary.csv")
-        summary_df.to_csv(filename, index=False)
-        logger.info("summary written to: %s", filename)
+
+        os.makedirs("acro_artifacts", exist_ok=True)
+        # Use explicit filename to indicate this should not be released
+        summary_path = os.path.normpath(
+            "acro_artifacts/DO_NOT_RELEASE_session_summary.csv"
+        )
+        summary_df.to_csv(summary_path, index=False)
+
+        self.add_custom(
+            summary_path,
+            "⚠️  WARNING: DO NOT RELEASE - Session summary for output checker use only",
+        )
 
     def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
         """Create a results file for checking.
@@ -613,13 +619,13 @@ def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
         logger.debug("finalise()")
         if interactive:
             self.validate_outputs()
+        self.add_summary_to_results()
         if ext == "json":
             self.finalise_json(path)
         elif ext == "xlsx":
             self.finalise_excel(path)
         else:
             raise ValueError("Invalid file extension. Options: {json, xlsx}")
-        self.write_summary(path)
         self.write_checksums(path)
         # check if the directory acro_artifacts exists and delete it
         if os.path.exists("acro_artifacts"):
@@ -653,7 +659,19 @@ def finalise_json(self, path: str) -> None:
             for file in files:
                 outputs[key]["files"].append({"name": file, "sdc": val.sdc})
 
-        results: dict = {"version": __version__, "results": outputs}
+        # Generate and include session summary for output checkers
+        summary_df = self.generate_summary()
+        session_summary = {
+            "DO_NOT_RELEASE": True,
+            "purpose": "Session summary for output checker use only",
+            "data": json.loads(summary_df.to_json(orient="records")),
+        }
+
+        results: dict = {
+            "version": __version__,
+            "results": outputs,
+            "session_summary": session_summary,
+        }
         filename: str = os.path.normpath(f"{path}/results.json")
         try:
             with open(filename, "w", newline="", encoding="utf-8") as handle:
@@ -703,7 +721,6 @@ def finalise_excel(self, path: str) -> None:
                 {"Sheet": sheet, "Command": command, "Summary": summary}
             )
             tmp_df.to_excel(writer, sheet_name="description", index=False, startrow=0)
-
             # individual sheets
             for output_id, output in self.results.items():
                 if output.output_type == "custom":
diff --git a/test/test_initial.py b/test/test_initial.py
index cb27c229..c3e03ff5 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1259,12 +1259,14 @@ def test_toggle_suppression():
 
 
 def test_summary_csv_created_with_json(data, acro):
-    """Test that summary.csv is created when finalising to JSON (Issue #224)."""
+    """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON (Issue #224)."""
     _ = acro.crosstab(data.year, data.grant_type)
     acro.add_exception("output_0", "Let me have it")
     acro.finalise(PATH, "json")
-    summary_path = os.path.normpath(f"{PATH}/summary.csv")
-    assert os.path.exists(summary_path), "summary.csv was not created"
+    summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv")
+    assert os.path.exists(summary_path), (
+        "DO_NOT_RELEASE_session_summary.csv was not created"
+    )
     summary_df = pd.read_csv(summary_path)
     assert len(summary_df) == 1
     assert "id" in summary_df.columns
@@ -1306,7 +1308,7 @@ def test_summary_variables_extracted(data):
     _ = acro_obj.crosstab(data.year, data.grant_type)
     acro_obj.add_exception("output_0", "Let me have it")
     acro_obj.finalise(PATH, "json")
-    summary_path = os.path.normpath(f"{PATH}/summary.csv")
+    summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv")
     summary_df = pd.read_csv(summary_path)
     variables = summary_df.iloc[0]["variables"]
     assert "year" in variables
@@ -1627,15 +1629,13 @@ def shape(self):
     assert total_records == 0
 
 
-def test_write_summary_empty_session(tmp_path):
-    """Test write_summary when there are no outputs (line 595 coverage)."""
+def test_write_summary_empty_session():
+    """Test add_summary_to_results when there are no outputs (line 595 coverage)."""
     records = Records()
 
-    # Create a temp directory for output
-    output_path = str(tmp_path / "empty_summary")
-
-    # Call write_summary with no outputs - should return early without creating file
-    records.write_summary(output_path)
+    records.add_summary_to_results()
 
     # Verify no file was created since summary was empty
-    assert not os.path.exists(os.path.normpath(f"{output_path}/summary.csv"))
+    assert not os.path.exists(
+        os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv")
+    )

From e3665ffe1afc445e22428cf7e7de995c68d29976 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 5 Mar 2026 11:03:28 +0000
Subject: [PATCH 08/29] feat(record): update warning message

---
 acro/record.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/acro/record.py b/acro/record.py
index dea7238a..5f17bfe9 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -601,7 +601,7 @@ def add_summary_to_results(self) -> None:
 
         self.add_custom(
             summary_path,
-            "⚠️  WARNING: DO NOT RELEASE - Session summary for output checker use only",
+            "WARNING: DO NOT RELEASE - Session summary for output checker use only",
         )
 
     def finalise(self, path: str, ext: str, interactive: bool = False) -> None:

From 404bba3e7b3af20c44834b3f6cfad58208adf5fe Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 5 Mar 2026 13:43:53 +0000
Subject: [PATCH 09/29] Add generate_summary() to provide high-level output
 overview for checkers

- Add multi-layered 'DO NOT RELEASE' warnings (filename, comment)
- Integrate session summary into JSON and Excel outputs
---
 acro/record.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index b5d1ac6b..62cacf9d 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -463,16 +463,39 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
                     variables.append(str(table.columns.name))
 
                 try:
-                    total_records += int(table.shape[0] * table.shape[1])
                     if method == "crosstab":
-                        cell_sum = table.values[~pd.isna(table.values)].sum()
-                        if cell_sum > 0:
-                            total_records = int(cell_sum)
+                        total_records = self._get_crosstab_record_count(table)
+                    else:
+                        total_records += int(table.shape[0] * table.shape[1])
                 except (TypeError, ValueError):
                     pass
 
         return variables, total_records
 
+    def _get_crosstab_record_count(self, table: DataFrame) -> int:
+        """Get record count from crosstab, excluding margins.
+
+        Parameters
+        ----------
+        table : DataFrame
+            The crosstab table.
+
+        Returns
+        -------
+        int
+            The record count.
+        """
+        working_table = table.copy()
+        if "All" in working_table.index:
+            working_table = working_table.drop("All", axis=0)
+        if "All" in working_table.columns:
+            working_table = working_table.drop("All", axis=1)
+
+        cell_sum = working_table.values[~pd.isna(working_table.values)].sum()
+        if cell_sum > 0:
+            return int(cell_sum)
+        return int(table.shape[0] * table.shape[1])
+
     def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
         """Extract variables and total records from regression output.
 

From fcbe9fae65a990475f1f769e12fc69c670addd68 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 5 Mar 2026 13:51:46 +0000
Subject: [PATCH 10/29] - Add multi-layered 'DO NOT RELEASE' warnings
 (filename, comment) - Integrate session summary into JSON and Excel outputs

---
 test/test_initial.py | 123 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/test/test_initial.py b/test/test_initial.py
index 49b63d4a..e572e52d 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1702,3 +1702,126 @@ def test_write_summary_empty_session():
     assert not os.path.exists(
         os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv")
     )
+
+
+def test_get_crosstab_record_count_with_margins():
+    """Test _get_crosstab_record_count correctly excludes margin rows/columns.
+
+    This tests lines 488-492 of record.py to ensure that "All" row and column
+    are properly excluded when calculating record counts from crosstabs.
+    """
+    records = Records()
+
+    # Create a table with margins (like what pandas crosstab produces)
+    table = pd.DataFrame(
+        [[5, 5, 10], [5, 5, 10], [10, 10, 20]],
+        index=pd.Index([2010, 2011, "All"], name="year"),
+        columns=pd.Index(["A", "B", "All"], name="grant_type")
+    )
+
+    # Call _get_crosstab_record_count
+    count = records._get_crosstab_record_count(table)
+
+    # Should sum only the non-margin cells: 5+5+5+5 = 20
+    # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col
+    assert count == 20
+
+
+def test_get_crosstab_record_count_without_margins():
+    """Test _get_crosstab_record_count with a table that has no margins.
+
+    When there is no "All" row or column, should sum all cells normally.
+    """
+    records = Records()
+
+    # Create a table without margins
+    table = pd.DataFrame(
+        [[5, 5], [5, 5]],
+        index=pd.Index([2010, 2011], name="year"),
+        columns=pd.Index(["A", "B"], name="grant_type")
+    )
+
+    count = records._get_crosstab_record_count(table)
+
+    # Should sum all cells: 5+5+5+5 = 20
+    assert count == 20
+
+
+def test_get_crosstab_record_count_with_margin_row_only():
+    """Test _get_crosstab_record_count when only margin row exists.
+
+    This covers the case where "All" is in the index but not in columns.
+    """
+    records = Records()
+
+    # Create a table with margin row but not margin column
+    table = pd.DataFrame(
+        [[5, 5], [5, 5], [10, 10]],
+        index=pd.Index([2010, 2011, "All"], name="year"),
+        columns=pd.Index(["A", "B"], name="grant_type")
+    )
+
+    count = records._get_crosstab_record_count(table)
+
+    # Should exclude "All" row and sum: 5+5+5+5 = 20
+    assert count == 20
+
+
+def test_get_crosstab_record_count_with_margin_column_only():
+    """Test _get_crosstab_record_count when only margin column exists.
+
+    This covers the case where "All" is in columns but not in the index.
+    """
+    records = Records()
+
+    # Create a table with margin column but not margin row
+    table = pd.DataFrame(
+        [[5, 5, 10], [5, 5, 10]],
+        index=pd.Index([2010, 2011], name="year"),
+        columns=pd.Index(["A", "B", "All"], name="grant_type")
+    )
+
+    count = records._get_crosstab_record_count(table)
+
+    # Should exclude "All" column and sum: 5+5+5+5 = 20
+    assert count == 20
+
+
+def test_get_crosstab_record_count_with_nan_values():
+    """Test _get_crosstab_record_count correctly handles NaN values.
+
+    This tests the isna() check when summing cell values (line 493).
+    """
+    records = Records()
+
+    # Create a table with some NaN values (suppressed cells)
+    table = pd.DataFrame(
+        [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]],
+        index=pd.Index([2010, 2011, "All"], name="year"),
+        columns=pd.Index(["A", "B", "C"], name="grant_type")
+    )
+
+    count = records._get_crosstab_record_count(table)
+
+    # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20
+    assert count == 20
+
+
+def test_get_crosstab_record_count_fallback():
+    """Test _get_crosstab_record_count fallback when cell_sum is 0.
+
+    This tests lines 494-495 where fallback calculation is used.
+    """
+    records = Records()
+
+    # Create a table with all zeros or NaN (simulating all suppressed)
+    table = pd.DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan]],
+        index=pd.Index([2010, 2011], name="year"),
+        columns=pd.Index(["A", "B"], name="grant_type")
+    )
+
+    count = records._get_crosstab_record_count(table)
+
+    # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4
+    assert count == 4

From 4c16dcd7cb4521d59142ba441aa7552014302fe7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 5 Mar 2026 13:52:12 +0000
Subject: [PATCH 11/29] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 test/test_initial.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index e572e52d..14f49d9b 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1716,7 +1716,7 @@ def test_get_crosstab_record_count_with_margins():
     table = pd.DataFrame(
         [[5, 5, 10], [5, 5, 10], [10, 10, 20]],
         index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type")
+        columns=pd.Index(["A", "B", "All"], name="grant_type"),
     )
 
     # Call _get_crosstab_record_count
@@ -1738,7 +1738,7 @@ def test_get_crosstab_record_count_without_margins():
     table = pd.DataFrame(
         [[5, 5], [5, 5]],
         index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type")
+        columns=pd.Index(["A", "B"], name="grant_type"),
     )
 
     count = records._get_crosstab_record_count(table)
@@ -1758,7 +1758,7 @@ def test_get_crosstab_record_count_with_margin_row_only():
     table = pd.DataFrame(
         [[5, 5], [5, 5], [10, 10]],
         index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type")
+        columns=pd.Index(["A", "B"], name="grant_type"),
     )
 
     count = records._get_crosstab_record_count(table)
@@ -1778,7 +1778,7 @@ def test_get_crosstab_record_count_with_margin_column_only():
     table = pd.DataFrame(
         [[5, 5, 10], [5, 5, 10]],
         index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type")
+        columns=pd.Index(["A", "B", "All"], name="grant_type"),
     )
 
     count = records._get_crosstab_record_count(table)
@@ -1798,7 +1798,7 @@ def test_get_crosstab_record_count_with_nan_values():
     table = pd.DataFrame(
         [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]],
         index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "C"], name="grant_type")
+        columns=pd.Index(["A", "B", "C"], name="grant_type"),
     )
 
     count = records._get_crosstab_record_count(table)
@@ -1818,7 +1818,7 @@ def test_get_crosstab_record_count_fallback():
     table = pd.DataFrame(
         [[np.nan, np.nan], [np.nan, np.nan]],
         index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type")
+        columns=pd.Index(["A", "B"], name="grant_type"),
     )
 
     count = records._get_crosstab_record_count(table)

From e1271ae43a93d3016ef7cc20465e34ca50324016 Mon Sep 17 00:00:00 2001
From: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
Date: Fri, 6 Mar 2026 06:00:13 +0000
Subject: [PATCH 12/29] Remove tests for index and columns name extraction

Remove tests for extracting table info based on index and columns names.

Signed-off-by: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
---
 test/test_initial.py | 48 --------------------------------------------
 1 file changed, 48 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index 14f49d9b..e1c9f8ee 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1618,54 +1618,6 @@ def test_generate_summary_with_crosstab(data, acro):
     )
 
 
-def test_extract_table_info_elif_index_name():
-    """Cover lines 456-457: elif table.index.name branch.
-
-    Triggered when index.names has no truthy entries (any() is False)
-    but index.name is set.
-    """
-    records = Records()
-
-    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
-    table.index.name = "myindex"
-
-    # Subclass the index type to override `names` so any(names) is False,
-    # while `name` still returns "myindex" (reads from _names[0]).
-    patched_idx = type(
-        "PatchedIdx",
-        (type(table.index),),
-        {"names": property(lambda _: [None])},
-    )
-    table.index.__class__ = patched_idx
-
-    output = [table]
-    variables, _ = records._extract_table_info(output, "linear")
-
-    assert "myindex" in variables
-
-
-def test_extract_table_info_elif_columns_name():
-    """Cover line 465: elif table.columns.name branch.
-
-    Triggered when columns.names has no truthy entries (any() is False)
-    but columns.name is set.
-    """
-    records = Records()
-
-    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
-    table.columns.name = "mycols"
-
-    # Override index.names to avoid the `if` branch for index (no index name set).
-    # Override columns.names so any() is False, forcing the elif for columns.
-    col_type = type(table.columns)
-    patched_idx = type("PatchedIdx", (col_type,), {"names": property(lambda _: [None])})
-    table.columns.__class__ = patched_idx
-
-    output = [table]
-    variables, _ = records._extract_table_info(output, "linear")
-
-    assert "mycols" in variables
-
 
 def test_extract_table_info_shape_type_error():
     """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.

From 6a96fd21ab3cdf32ca5ae27a170c2e39a4d46b8a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 6 Mar 2026 06:00:27 +0000
Subject: [PATCH 13/29] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 test/test_initial.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index e1c9f8ee..a56fb7d0 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1618,7 +1618,6 @@ def test_generate_summary_with_crosstab(data, acro):
     )
 
 
-
 def test_extract_table_info_shape_type_error():
     """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
 

From c2cd7b6b3c0721ea79d322b510caf7713eceeba1 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Fri, 6 Mar 2026 10:09:57 +0000
Subject: [PATCH 14/29] refactor: remove unreachable elif branches and
 unnecessary tests - All core functionality preserved and tested with real
 DataFrame scenarios

---
 acro/record.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 62cacf9d..4e896cfd 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -452,15 +452,11 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
             if isinstance(table, DataFrame):
                 if hasattr(table.index, "names") and any(table.index.names):
                     variables.extend(str(n) for n in table.index.names if n is not None)
-                elif table.index.name:
-                    variables.append(str(table.index.name))
 
                 if hasattr(table.columns, "names") and any(table.columns.names):
                     variables.extend(
                         str(n) for n in table.columns.names if n is not None
                     )
-                elif table.columns.name:
-                    variables.append(str(table.columns.name))
 
                 try:
                     if method == "crosstab":

From ee4652884225269283f1435440f2d519da5380b3 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 10:33:07 +0000
Subject: [PATCH 15/29] Fixes issue where tables with identical variables but
 different suppression settings were not being flagged as having differencing
 risk.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 acro/record.py       | 159 ++++++++++++++++++++++------
 test/test_initial.py | 247 ++++++++++++++-----------------------------
 2 files changed, 204 insertions(+), 202 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 4e896cfd..62eceb8c 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -430,7 +430,12 @@ def validate_outputs(self) -> None:
                 )
                 record.exception = input("")
 
-    def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]:
+    def _extract_table_info(
+        self,
+        output: list,
+        method: str,  # noqa: ARG002
+        properties: dict,  # noqa: ARG002
+    ) -> tuple[list[str], int]:
         """Extract variables and total records from table output.
 
         Parameters
@@ -439,6 +444,8 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
             The output to extract information from.
         method : str
             The method used to generate the output.
+        properties : dict
+            Properties dictionary (unused but kept for compatibility).
 
         Returns
         -------
@@ -459,39 +466,17 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
                     )
 
                 try:
-                    if method == "crosstab":
-                        total_records = self._get_crosstab_record_count(table)
+                    # Count non-NaN cells for record count
+                    cell_sum = table.values[~pd.isna(table.values)].sum()
+                    if cell_sum > 0:
+                        total_records = int(cell_sum)
                     else:
-                        total_records += int(table.shape[0] * table.shape[1])
+                        total_records = int(table.shape[0] * table.shape[1])
                 except (TypeError, ValueError):
                     pass
 
         return variables, total_records
 
-    def _get_crosstab_record_count(self, table: DataFrame) -> int:
-        """Get record count from crosstab, excluding margins.
-
-        Parameters
-        ----------
-        table : DataFrame
-            The crosstab table.
-
-        Returns
-        -------
-        int
-            The record count.
-        """
-        working_table = table.copy()
-        if "All" in working_table.index:
-            working_table = working_table.drop("All", axis=0)
-        if "All" in working_table.columns:
-            working_table = working_table.drop("All", axis=1)
-
-        cell_sum = working_table.values[~pd.isna(working_table.values)].sum()
-        if cell_sum > 0:
-            return int(cell_sum)
-        return int(table.shape[0] * table.shape[1])
-
     def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
         """Extract variables and total records from regression output.
 
@@ -544,12 +529,113 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
             if not table_outputs.empty:
                 for _, group in table_outputs.groupby("variables"):
                     if len(group) > 1:
-                        counts = group["total_records"].unique()
-                        if len(counts) > 1:
+                        # Check for different suppression settings
+                        suppressions = group["suppression"].unique()
+                        # Risk if same variables with different suppression settings
+                        if len(suppressions) > 1:
                             summary_df.loc[group.index, "diff_risk"] = True
 
         return summary_df
 
+    def _extract_all_variables(self) -> list[str]:
+        """Extract all unique variables across all outputs.
+
+        Returns
+        -------
+        list[str]
+            Sorted list of unique variable names.
+        """
+        all_variables: set[str] = set()
+
+        for rec in self.results.values():
+            if rec.output_type == "custom":
+                continue
+            variables = self._get_output_variables(rec)
+            all_variables.update(variables)
+
+        return sorted(all_variables)
+
+    def _get_output_variables(self, rec) -> list[str]:
+        """Extract variables from a single record.
+        
+        Parameters
+        ----------
+        rec : Record
+            The record to extract variables from.
+        
+        Returns
+        -------
+        list[str]
+            List of variable names.
+        """
+        method = rec.properties.get("method", rec.output_type)
+        variables: list[str] = []
+
+        if rec.output_type == "table":
+            variables, _ = self._extract_table_info(rec.output, method, rec.properties)
+        elif rec.output_type == "regression":
+            variables, _ = self._extract_regression_info(rec.output)
+            dof = rec.properties.get("dof")
+            if dof is not None:
+                variables.append(f"dof={dof}")
+
+        return variables
+
+
+    def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
+        """Build a variable-output matrix showing variable usage.
+
+        Parameters
+        ----------
+        summary_df : DataFrame
+            The base summary DataFrame.
+
+        Returns
+        -------
+        DataFrame
+            Summary with binary variable columns added.
+        """
+        all_variables = self._extract_all_variables()
+
+        if not all_variables:
+            return summary_df
+
+        # Create binary columns for each variable
+        for var in all_variables:
+            summary_df[var] = summary_df["variables"].apply(
+                lambda vars_str: 1 if var in vars_str.split("; ") else 0
+            )
+
+        return summary_df
+
+    def generate_variable_matrix_table(self) -> DataFrame:
+        """Generate a clean variable-output matrix table for PHS requirements.
+
+        Creates a table with one row per output and one column per variable,
+        plus an output_type column. Binary values indicate variable usage.
+
+        Returns
+        -------
+        DataFrame
+            Variable matrix table with columns: output_type, var1, var2, ...
+        """
+        all_variables = self._extract_all_variables()
+        matrix_rows = []
+
+        for uid, rec in self.results.items():
+            if rec.output_type == "custom":
+                continue
+
+            variables = self._get_output_variables(rec)
+
+            row = {"output_id": uid, "output_type": rec.output_type}
+            for var in all_variables:
+                row[var] = 1 if var in variables else 0
+
+            matrix_rows.append(row)
+
+        return DataFrame(matrix_rows)
+
     def generate_summary(self) -> DataFrame:
         """Generate a summary DataFrame of all outputs in the session.
 
@@ -561,7 +647,8 @@ def generate_summary(self) -> DataFrame:
         -------
         DataFrame
             Summary of all outputs with columns: id, method, status, type,
-            command, summary, variables, total_records, timestamp, diff_risk.
+            command, summary, variables, total_records, suppression,
+            timestamp, diff_risk.
         """
         rows = []
         for uid, rec in self.results.items():
@@ -572,7 +659,9 @@ def generate_summary(self) -> DataFrame:
             total_records: int = 0
 
             if rec.output_type == "table":
-                variables, total_records = self._extract_table_info(rec.output, method)
+                variables, total_records = self._extract_table_info(
+                    rec.output, method, rec.properties
+                )
             elif rec.output_type == "regression":
                 variables, total_records = self._extract_regression_info(rec.output)
                 dof = rec.properties.get("dof")
@@ -581,6 +670,10 @@ def generate_summary(self) -> DataFrame:
 
             variables_str = "; ".join(variables) if variables else ""
 
+            suppression = False
+            if isinstance(rec.sdc, dict) and "summary" in rec.sdc:
+                suppression = rec.sdc["summary"].get("suppressed", False)
+
             rows.append(
                 {
                     "id": uid,
@@ -591,12 +684,14 @@ def generate_summary(self) -> DataFrame:
                     "summary": rec.summary,
                     "variables": variables_str,
                     "total_records": total_records,
+                    "suppression": suppression,
                     "timestamp": rec.timestamp,
                 }
             )
 
         summary_df = DataFrame(rows)
         summary_df = self._mark_diff_risk(summary_df)
+        summary_df = self._build_variable_matrix(summary_df)
 
         return summary_df
 
diff --git a/test/test_initial.py b/test/test_initial.py
index a56fb7d0..54c0917b 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1336,6 +1336,7 @@ def test_summary_csv_created_with_json(data, acro):
     assert "method" in summary_df.columns
     assert "variables" in summary_df.columns
     assert "total_records" in summary_df.columns
+    assert "suppression" in summary_df.columns
     assert "diff_risk" in summary_df.columns
     # check values
     assert summary_df.iloc[0]["id"] == "output_0"
@@ -1380,20 +1381,24 @@ def test_summary_variables_extracted(data):
 
 
 def test_summary_differencing_risk(data):
-    """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224)."""
+    """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224)."""
     acro_obj = ACRO(suppress=True)
     _ = acro_obj.crosstab(data.year, data.grant_type)
-    data_subset = data.iloc[10:].copy()
-    _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type)
+    acro_obj.suppress = False
+    _ = acro_obj.crosstab(data.year, data.grant_type)
     acro_obj.add_exception("output_0", "Let me have it")
     acro_obj.add_exception("output_1", "I need this")
 
     summary_df = acro_obj.results.generate_summary()
 
+    # Both outputs should have same variables
     assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"]
 
-    assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"]
+    # But different suppression settings
+    assert summary_df.iloc[0]["suppression"] == True
+    assert summary_df.iloc[1]["suppression"] == False
 
+    # Both should be flagged as diff_risk
     assert summary_df.iloc[0]["diff_risk"] == True
     assert summary_df.iloc[1]["diff_risk"] == True
 
@@ -1428,21 +1433,14 @@ def test_extract_table_info(data, acro):
     acro.crosstab(data.year, data.grant_type)
     output = acro.results.get_index(0)
 
-    # Test extracting info from table output with crosstab method
+    # Test extracting info from table output
     variables, total_records = acro.results._extract_table_info(
-        output.output, "crosstab"
+        output.output, "crosstab", output.properties
     )
     assert len(variables) > 0
     assert total_records > 0
     assert "year" in variables or "grant_type" in variables
 
-    # Also test with non-crosstab method to ensure both paths work
-    variables2, total_records2 = acro.results._extract_table_info(
-        output.output, "pivot_table"
-    )
-    assert len(variables2) > 0
-    assert total_records2 > 0
-
 
 def test_extract_regression_info():
     """Test the _extract_regression_info helper method with sample data."""
@@ -1499,7 +1497,9 @@ def test_extract_table_info_with_zero_cell_sum():
     table.columns.name = "cols"
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables and use shape-based count when cell_sum is 0
     assert "idx" in variables
@@ -1517,7 +1517,9 @@ def __init__(self):
             self.shape = "invalid"  # Non-integer shape to trigger TypeError
 
     output = [InvalidTable()]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should handle exception gracefully
     assert isinstance(variables, list)
@@ -1569,7 +1571,9 @@ def test_extract_table_info_with_numeric_data():
 
     output = [table]
     # Test with crosstab method to trigger the cell_sum > 0 path
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables
     assert "idx" in variables
@@ -1591,7 +1595,9 @@ def test_extract_table_info_with_mixed_data():
     table.index.name = "idx"
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables
     assert "idx" in variables
@@ -1599,30 +1605,14 @@ def test_extract_table_info_with_mixed_data():
     assert total_records == 50
 
 
-def test_generate_summary_with_crosstab(data, acro):
-    """Test generate_summary triggers _extract_table_info coverage for lines 456-465."""
-    # Create actual crosstab output to trigger generate_summary path
-    _ = acro.crosstab(data.year, data.grant_type)
-
-    # Call generate_summary which will call _extract_table_info
-    summary_df = acro.results.generate_summary()
-
-    # Verify summary was generated
-    assert len(summary_df) > 0
-    assert "variables" in summary_df.columns
-    assert "total_records" in summary_df.columns
-    # Should have extracted year and grant_type as variables
-    assert (
-        "year" in summary_df.iloc[0]["variables"]
-        or "grant_type" in summary_df.iloc[0]["variables"]
-    )
-
-
 def test_extract_table_info_shape_type_error():
     """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
 
     Triggered when shape multiplication raises TypeError (e.g. shape returns
     (None, None) so None * None raises TypeError).
+    
+    Note: The current implementation successfully sums cell values even when
+    shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0.
     """
     records = Records()
 
@@ -1637,142 +1627,59 @@ def shape(self):
     table.__class__ = BrokenShapeDF
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "linear")
-
-    assert isinstance(variables, list)
-    assert total_records == 0
-
-
-def test_write_summary_empty_session():
-    """Test add_summary_to_results when there are no outputs (line 595 coverage)."""
-    records = Records()
-
-    records.add_summary_to_results()
-
-    # Verify no file was created since summary was empty
-    assert not os.path.exists(
-        os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv")
-    )
-
-
-def test_get_crosstab_record_count_with_margins():
-    """Test _get_crosstab_record_count correctly excludes margin rows/columns.
-
-    This tests lines 488-492 of record.py to ensure that "All" row and column
-    are properly excluded when calculating record counts from crosstabs.
-    """
-    records = Records()
-
-    # Create a table with margins (like what pandas crosstab produces)
-    table = pd.DataFrame(
-        [[5, 5, 10], [5, 5, 10], [10, 10, 20]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type"),
-    )
-
-    # Call _get_crosstab_record_count
-    count = records._get_crosstab_record_count(table)
-
-    # Should sum only the non-margin cells: 5+5+5+5 = 20
-    # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col
-    assert count == 20
-
-
-def test_get_crosstab_record_count_without_margins():
-    """Test _get_crosstab_record_count with a table that has no margins.
-
-    When there is no "All" row or column, should sum all cells normally.
-    """
-    records = Records()
-
-    # Create a table without margins
-    table = pd.DataFrame(
-        [[5, 5], [5, 5]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should sum all cells: 5+5+5+5 = 20
-    assert count == 20
-
-
-def test_get_crosstab_record_count_with_margin_row_only():
-    """Test _get_crosstab_record_count when only margin row exists.
-
-    This covers the case where "All" is in the index but not in columns.
-    """
-    records = Records()
-
-    # Create a table with margin row but not margin column
-    table = pd.DataFrame(
-        [[5, 5], [5, 5], [10, 10]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should exclude "All" row and sum: 5+5+5+5 = 20
-    assert count == 20
-
-
-def test_get_crosstab_record_count_with_margin_column_only():
-    """Test _get_crosstab_record_count when only margin column exists.
-
-    This covers the case where "All" is in columns but not in the index.
-    """
-    records = Records()
-
-    # Create a table with margin column but not margin row
-    table = pd.DataFrame(
-        [[5, 5, 10], [5, 5, 10]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type"),
+    variables, total_records = records._extract_table_info(
+        output, "linear", {"margins": False}
     )
 
-    count = records._get_crosstab_record_count(table)
-
-    # Should exclude "All" column and sum: 5+5+5+5 = 20
-    assert count == 20
-
-
-def test_get_crosstab_record_count_with_nan_values():
-    """Test _get_crosstab_record_count correctly handles NaN values.
-
-    This tests the isna() check when summing cell values (line 493).
-    """
-    records = Records()
-
-    # Create a table with some NaN values (suppressed cells)
-    table = pd.DataFrame(
-        [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "C"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20
-    assert count == 20
-
+    assert isinstance(variables, list)
+    assert total_records == 10  # Sum of cell values: 1+2+3+4
 
-def test_get_crosstab_record_count_fallback():
-    """Test _get_crosstab_record_count fallback when cell_sum is 0.
 
-    This tests lines 494-495 where fallback calculation is used.
+def test_generate_variable_matrix_table():
+    """Test the PHS-format variable matrix table.
+    
+    Should have one row per output and one column per variable,
+    with binary values indicating variable presence.
     """
-    records = Records()
-
-    # Create a table with all zeros or NaN (simulating all suppressed)
-    table = pd.DataFrame(
-        [[np.nan, np.nan], [np.nan, np.nan]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4
-    assert count == 4
+    acro_obj = ACRO(suppress=False)
+    
+    data = pd.DataFrame({
+        'year': [2010]*10 + [2011]*10,
+        'grant_type': ['A']*5 + ['B']*5 + ['A']*5 + ['B']*5,
+        'region': ['North']*5 + ['South']*5 + ['North']*5 + ['South']*5,
+    })
+    
+    # Create three outputs with different variable combinations
+    _ = acro_obj.crosstab(data.year, data.grant_type)     
+    _ = acro_obj.crosstab(data.year, data.region)        
+    _ = acro_obj.crosstab(data.grant_type, data.region) 
+    
+    var_matrix = acro_obj.results.generate_variable_matrix_table()
+    
+
+    assert 'output_id' in var_matrix.columns
+    assert 'output_type' in var_matrix.columns
+    assert len(var_matrix) == 3  
+    
+    assert 'year' in var_matrix.columns
+    assert 'grant_type' in var_matrix.columns
+    assert 'region' in var_matrix.columns
+    
+
+    for row in var_matrix.itertuples():
+        assert row.year in [0, 1]
+        assert row.grant_type in [0, 1]
+        assert row.region in [0, 1]
+    
+
+    assert var_matrix.iloc[0]['year'] == 1
+    assert var_matrix.iloc[0]['grant_type'] == 1
+    assert var_matrix.iloc[0]['region'] == 0
+    
+    assert var_matrix.iloc[1]['year'] == 1
+    assert var_matrix.iloc[1]['grant_type'] == 0
+    assert var_matrix.iloc[1]['region'] == 1
+    
+    assert var_matrix.iloc[2]['year'] == 0
+    assert var_matrix.iloc[2]['grant_type'] == 1
+    assert var_matrix.iloc[2]['region'] == 1

From 0202e392df46ac3d353347e13765d5cc8a3faf68 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Mar 2026 10:33:52 +0000
Subject: [PATCH 16/29] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 acro/record.py       |  5 ++--
 test/test_initial.py | 69 ++++++++++++++++++++++----------------------
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 62eceb8c..011b7dc3 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -557,12 +557,12 @@ def _extract_all_variables(self) -> list[str]:
 
     def _get_output_variables(self, rec) -> list[str]:
         """Extract variables from a single record.
-        
+
         Parameters
         ----------
         rec : Record
             The record to extract variables from.
-        
+
         Returns
         -------
         list[str]
@@ -581,7 +581,6 @@ def _get_output_variables(self, rec) -> list[str]:
 
         return variables
 
-
     def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
         """Build a variable-output matrix showing variable usage.
 
diff --git a/test/test_initial.py b/test/test_initial.py
index 54c0917b..c71c387e 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1610,7 +1610,7 @@ def test_extract_table_info_shape_type_error():
 
     Triggered when shape multiplication raises TypeError (e.g. shape returns
     (None, None) so None * None raises TypeError).
-    
+
     Note: The current implementation successfully sums cell values even when
     shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0.
     """
@@ -1637,49 +1637,48 @@ def shape(self):
 
 def test_generate_variable_matrix_table():
     """Test the PHS-format variable matrix table.
-    
+
     Should have one row per output and one column per variable,
     with binary values indicating variable presence.
     """
     acro_obj = ACRO(suppress=False)
-    
-    data = pd.DataFrame({
-        'year': [2010]*10 + [2011]*10,
-        'grant_type': ['A']*5 + ['B']*5 + ['A']*5 + ['B']*5,
-        'region': ['North']*5 + ['South']*5 + ['North']*5 + ['South']*5,
-    })
-    
+
+    data = pd.DataFrame(
+        {
+            "year": [2010] * 10 + [2011] * 10,
+            "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5,
+            "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5,
+        }
+    )
+
     # Create three outputs with different variable combinations
-    _ = acro_obj.crosstab(data.year, data.grant_type)     
-    _ = acro_obj.crosstab(data.year, data.region)        
-    _ = acro_obj.crosstab(data.grant_type, data.region) 
-    
+    _ = acro_obj.crosstab(data.year, data.grant_type)
+    _ = acro_obj.crosstab(data.year, data.region)
+    _ = acro_obj.crosstab(data.grant_type, data.region)
+
     var_matrix = acro_obj.results.generate_variable_matrix_table()
-    
 
-    assert 'output_id' in var_matrix.columns
-    assert 'output_type' in var_matrix.columns
-    assert len(var_matrix) == 3  
-    
-    assert 'year' in var_matrix.columns
-    assert 'grant_type' in var_matrix.columns
-    assert 'region' in var_matrix.columns
-    
+    assert "output_id" in var_matrix.columns
+    assert "output_type" in var_matrix.columns
+    assert len(var_matrix) == 3
+
+    assert "year" in var_matrix.columns
+    assert "grant_type" in var_matrix.columns
+    assert "region" in var_matrix.columns
 
     for row in var_matrix.itertuples():
         assert row.year in [0, 1]
         assert row.grant_type in [0, 1]
         assert row.region in [0, 1]
-    
-
-    assert var_matrix.iloc[0]['year'] == 1
-    assert var_matrix.iloc[0]['grant_type'] == 1
-    assert var_matrix.iloc[0]['region'] == 0
-    
-    assert var_matrix.iloc[1]['year'] == 1
-    assert var_matrix.iloc[1]['grant_type'] == 0
-    assert var_matrix.iloc[1]['region'] == 1
-    
-    assert var_matrix.iloc[2]['year'] == 0
-    assert var_matrix.iloc[2]['grant_type'] == 1
-    assert var_matrix.iloc[2]['region'] == 1
+
+    assert var_matrix.iloc[0]["year"] == 1
+    assert var_matrix.iloc[0]["grant_type"] == 1
+    assert var_matrix.iloc[0]["region"] == 0
+
+    assert var_matrix.iloc[1]["year"] == 1
+    assert var_matrix.iloc[1]["grant_type"] == 0
+    assert var_matrix.iloc[1]["region"] == 1
+
+    assert var_matrix.iloc[2]["year"] == 0
+    assert var_matrix.iloc[2]["grant_type"] == 1
+    assert var_matrix.iloc[2]["region"] == 1

From 61ab3077b0378fd60a8e72be6587d64169411eb4 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 10:37:25 +0000
Subject: [PATCH 17/29] Fixes issue where tables with identical variables but
 different suppression settings were not being flagged as having differencing
 risk.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 acro/record.py       | 158 +++++++++++++++++++++++-------
 test/test_initial.py | 224 +++++++++++++------------------------------
 2 files changed, 191 insertions(+), 191 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 4e896cfd..2086794b 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -430,7 +430,12 @@ def validate_outputs(self) -> None:
                 )
                 record.exception = input("")
 
-    def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int]:
+    def _extract_table_info(
+        self,
+        output: list,
+        method: str,  # noqa: ARG002
+        properties: dict,  # noqa: ARG002
+    ) -> tuple[list[str], int]:
         """Extract variables and total records from table output.
 
         Parameters
@@ -439,6 +444,8 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
             The output to extract information from.
         method : str
             The method used to generate the output.
+        properties : dict
+            Properties dictionary (unused but kept for compatibility).
 
         Returns
         -------
@@ -459,39 +466,17 @@ def _extract_table_info(self, output: list, method: str) -> tuple[list[str], int
                     )
 
                 try:
-                    if method == "crosstab":
-                        total_records = self._get_crosstab_record_count(table)
+                    # Count non-NaN cells for record count
+                    cell_sum = table.values[~pd.isna(table.values)].sum()
+                    if cell_sum > 0:
+                        total_records = int(cell_sum)
                     else:
-                        total_records += int(table.shape[0] * table.shape[1])
+                        total_records = int(table.shape[0] * table.shape[1])
                 except (TypeError, ValueError):
                     pass
 
         return variables, total_records
 
-    def _get_crosstab_record_count(self, table: DataFrame) -> int:
-        """Get record count from crosstab, excluding margins.
-
-        Parameters
-        ----------
-        table : DataFrame
-            The crosstab table.
-
-        Returns
-        -------
-        int
-            The record count.
-        """
-        working_table = table.copy()
-        if "All" in working_table.index:
-            working_table = working_table.drop("All", axis=0)
-        if "All" in working_table.columns:
-            working_table = working_table.drop("All", axis=1)
-
-        cell_sum = working_table.values[~pd.isna(working_table.values)].sum()
-        if cell_sum > 0:
-            return int(cell_sum)
-        return int(table.shape[0] * table.shape[1])
-
     def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
         """Extract variables and total records from regression output.
 
@@ -544,12 +529,112 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
             if not table_outputs.empty:
                 for _, group in table_outputs.groupby("variables"):
                     if len(group) > 1:
-                        counts = group["total_records"].unique()
-                        if len(counts) > 1:
+                        # Check for different suppression settings
+                        suppressions = group["suppression"].unique()
+                        # Risk if same variables with different suppression settings
+                        if len(suppressions) > 1:
                             summary_df.loc[group.index, "diff_risk"] = True
 
         return summary_df
 
+    def _extract_all_variables(self) -> list[str]:
+        """Extract all unique variables across all outputs.
+
+        Returns
+        -------
+        list[str]
+            Sorted list of unique variable names.
+        """
+        all_variables: set[str] = set()
+
+        for rec in self.results.values():
+            if rec.output_type == "custom":
+                continue
+            variables = self._get_output_variables(rec)
+            all_variables.update(variables)
+
+        return sorted(all_variables)
+
+    def _get_output_variables(self, rec: Record) -> list[str]:
+        """Extract variables from a single record.
+
+        Parameters
+        ----------
+        rec : Record
+            The record to extract variables from.
+
+        Returns
+        -------
+        list[str]
+            List of variable names.
+        """
+        method = rec.properties.get("method", rec.output_type)
+        variables: list[str] = []
+
+        if rec.output_type == "table":
+            variables, _ = self._extract_table_info(rec.output, method, rec.properties)
+        elif rec.output_type == "regression":
+            variables, _ = self._extract_regression_info(rec.output)
+            dof = rec.properties.get("dof")
+            if dof is not None:
+                variables.append(f"dof={dof}")
+
+        return variables
+
+    def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
+        """Build a variable-output matrix showing variable usage.
+
+        Parameters
+        ----------
+        summary_df : DataFrame
+            The base summary DataFrame.
+
+        Returns
+        -------
+        DataFrame
+            Summary with binary variable columns added.
+        """
+        all_variables = self._extract_all_variables()
+
+        if not all_variables:
+            return summary_df
+
+        # Create binary columns for each variable
+        for var in all_variables:
+            summary_df[var] = summary_df["variables"].apply(
+                lambda vars_str, v=var: 1 if v in vars_str.split("; ") else 0
+            )
+
+        return summary_df
+
+    def generate_variable_matrix_table(self) -> DataFrame:
+        """Generate a clean variable-output matrix table for PHS requirements.
+
+        Creates a table with one row per output and one column per variable,
+        plus an output_type column. Binary values indicate variable usage.
+
+        Returns
+        -------
+        DataFrame
+            Variable matrix table with columns: output_type, var1, var2, ...
+        """
+        all_variables = self._extract_all_variables()
+        matrix_rows = []
+
+        for uid, rec in self.results.items():
+            if rec.output_type == "custom":
+                continue
+
+            variables = self._get_output_variables(rec)
+
+            row: dict[str, Any] = {"output_id": uid, "output_type": rec.output_type}
+            for var in all_variables:
+                row[var] = 1 if var in variables else 0
+
+            matrix_rows.append(row)
+
+        return DataFrame(matrix_rows)
+
     def generate_summary(self) -> DataFrame:
         """Generate a summary DataFrame of all outputs in the session.
 
@@ -561,7 +646,8 @@ def generate_summary(self) -> DataFrame:
         -------
         DataFrame
             Summary of all outputs with columns: id, method, status, type,
-            command, summary, variables, total_records, timestamp, diff_risk.
+            command, summary, variables, total_records, suppression,
+            timestamp, diff_risk.
         """
         rows = []
         for uid, rec in self.results.items():
@@ -572,7 +658,9 @@ def generate_summary(self) -> DataFrame:
             total_records: int = 0
 
             if rec.output_type == "table":
-                variables, total_records = self._extract_table_info(rec.output, method)
+                variables, total_records = self._extract_table_info(
+                    rec.output, method, rec.properties
+                )
             elif rec.output_type == "regression":
                 variables, total_records = self._extract_regression_info(rec.output)
                 dof = rec.properties.get("dof")
@@ -581,6 +669,10 @@ def generate_summary(self) -> DataFrame:
 
             variables_str = "; ".join(variables) if variables else ""
 
+            suppression: bool = False
+            if isinstance(rec.sdc, dict) and "summary" in rec.sdc:
+                suppression = bool(rec.sdc["summary"].get("suppressed", False))
+
             rows.append(
                 {
                     "id": uid,
@@ -591,12 +683,14 @@ def generate_summary(self) -> DataFrame:
                     "summary": rec.summary,
                     "variables": variables_str,
                     "total_records": total_records,
+                    "suppression": suppression,
                     "timestamp": rec.timestamp,
                 }
             )
 
         summary_df = DataFrame(rows)
         summary_df = self._mark_diff_risk(summary_df)
+        summary_df = self._build_variable_matrix(summary_df)
 
         return summary_df
 
diff --git a/test/test_initial.py b/test/test_initial.py
index a56fb7d0..c71c387e 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1336,6 +1336,7 @@ def test_summary_csv_created_with_json(data, acro):
     assert "method" in summary_df.columns
     assert "variables" in summary_df.columns
     assert "total_records" in summary_df.columns
+    assert "suppression" in summary_df.columns
     assert "diff_risk" in summary_df.columns
     # check values
     assert summary_df.iloc[0]["id"] == "output_0"
@@ -1380,20 +1381,24 @@ def test_summary_variables_extracted(data):
 
 
 def test_summary_differencing_risk(data):
-    """Test that differencing risk is flagged when tables share variables but have different record counts (Issue #224)."""
+    """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224)."""
     acro_obj = ACRO(suppress=True)
     _ = acro_obj.crosstab(data.year, data.grant_type)
-    data_subset = data.iloc[10:].copy()
-    _ = acro_obj.crosstab(data_subset.year, data_subset.grant_type)
+    acro_obj.suppress = False
+    _ = acro_obj.crosstab(data.year, data.grant_type)
     acro_obj.add_exception("output_0", "Let me have it")
     acro_obj.add_exception("output_1", "I need this")
 
     summary_df = acro_obj.results.generate_summary()
 
+    # Both outputs should have same variables
     assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"]
 
-    assert summary_df.iloc[0]["total_records"] != summary_df.iloc[1]["total_records"]
+    # But different suppression settings
+    assert summary_df.iloc[0]["suppression"] == True
+    assert summary_df.iloc[1]["suppression"] == False
 
+    # Both should be flagged as diff_risk
     assert summary_df.iloc[0]["diff_risk"] == True
     assert summary_df.iloc[1]["diff_risk"] == True
 
@@ -1428,21 +1433,14 @@ def test_extract_table_info(data, acro):
     acro.crosstab(data.year, data.grant_type)
     output = acro.results.get_index(0)
 
-    # Test extracting info from table output with crosstab method
+    # Test extracting info from table output
     variables, total_records = acro.results._extract_table_info(
-        output.output, "crosstab"
+        output.output, "crosstab", output.properties
     )
     assert len(variables) > 0
     assert total_records > 0
     assert "year" in variables or "grant_type" in variables
 
-    # Also test with non-crosstab method to ensure both paths work
-    variables2, total_records2 = acro.results._extract_table_info(
-        output.output, "pivot_table"
-    )
-    assert len(variables2) > 0
-    assert total_records2 > 0
-
 
 def test_extract_regression_info():
     """Test the _extract_regression_info helper method with sample data."""
@@ -1499,7 +1497,9 @@ def test_extract_table_info_with_zero_cell_sum():
     table.columns.name = "cols"
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables and use shape-based count when cell_sum is 0
     assert "idx" in variables
@@ -1517,7 +1517,9 @@ def __init__(self):
             self.shape = "invalid"  # Non-integer shape to trigger TypeError
 
     output = [InvalidTable()]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should handle exception gracefully
     assert isinstance(variables, list)
@@ -1569,7 +1571,9 @@ def test_extract_table_info_with_numeric_data():
 
     output = [table]
     # Test with crosstab method to trigger the cell_sum > 0 path
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables
     assert "idx" in variables
@@ -1591,7 +1595,9 @@ def test_extract_table_info_with_mixed_data():
     table.index.name = "idx"
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "crosstab")
+    variables, total_records = records._extract_table_info(
+        output, "crosstab", {"margins": False}
+    )
 
     # Should extract variables
     assert "idx" in variables
@@ -1599,30 +1605,14 @@ def test_extract_table_info_with_mixed_data():
     assert total_records == 50
 
 
-def test_generate_summary_with_crosstab(data, acro):
-    """Test generate_summary triggers _extract_table_info coverage for lines 456-465."""
-    # Create actual crosstab output to trigger generate_summary path
-    _ = acro.crosstab(data.year, data.grant_type)
-
-    # Call generate_summary which will call _extract_table_info
-    summary_df = acro.results.generate_summary()
-
-    # Verify summary was generated
-    assert len(summary_df) > 0
-    assert "variables" in summary_df.columns
-    assert "total_records" in summary_df.columns
-    # Should have extracted year and grant_type as variables
-    assert (
-        "year" in summary_df.iloc[0]["variables"]
-        or "grant_type" in summary_df.iloc[0]["variables"]
-    )
-
-
 def test_extract_table_info_shape_type_error():
     """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
 
     Triggered when shape multiplication raises TypeError (e.g. shape returns
     (None, None) so None * None raises TypeError).
+
+    Note: The current implementation successfully sums cell values even when
+    shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0.
     """
     records = Records()
 
@@ -1637,142 +1627,58 @@ def shape(self):
     table.__class__ = BrokenShapeDF
 
     output = [table]
-    variables, total_records = records._extract_table_info(output, "linear")
-
-    assert isinstance(variables, list)
-    assert total_records == 0
-
-
-def test_write_summary_empty_session():
-    """Test add_summary_to_results when there are no outputs (line 595 coverage)."""
-    records = Records()
-
-    records.add_summary_to_results()
-
-    # Verify no file was created since summary was empty
-    assert not os.path.exists(
-        os.path.normpath("acro_artifacts/DO_NOT_RELEASE_session_summary.csv")
+    variables, total_records = records._extract_table_info(
+        output, "linear", {"margins": False}
     )
 
-
-def test_get_crosstab_record_count_with_margins():
-    """Test _get_crosstab_record_count correctly excludes margin rows/columns.
-
-    This tests lines 488-492 of record.py to ensure that "All" row and column
-    are properly excluded when calculating record counts from crosstabs.
-    """
-    records = Records()
-
-    # Create a table with margins (like what pandas crosstab produces)
-    table = pd.DataFrame(
-        [[5, 5, 10], [5, 5, 10], [10, 10, 20]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type"),
-    )
-
-    # Call _get_crosstab_record_count
-    count = records._get_crosstab_record_count(table)
-
-    # Should sum only the non-margin cells: 5+5+5+5 = 20
-    # Should NOT include the "All" row or column: [10, 10, 20] from row and [10, 10, 20] from col
-    assert count == 20
-
-
-def test_get_crosstab_record_count_without_margins():
-    """Test _get_crosstab_record_count with a table that has no margins.
-
-    When there is no "All" row or column, should sum all cells normally.
-    """
-    records = Records()
-
-    # Create a table without margins
-    table = pd.DataFrame(
-        [[5, 5], [5, 5]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should sum all cells: 5+5+5+5 = 20
-    assert count == 20
-
-
-def test_get_crosstab_record_count_with_margin_row_only():
-    """Test _get_crosstab_record_count when only margin row exists.
-
-    This covers the case where "All" is in the index but not in columns.
-    """
-    records = Records()
-
-    # Create a table with margin row but not margin column
-    table = pd.DataFrame(
-        [[5, 5], [5, 5], [10, 10]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should exclude "All" row and sum: 5+5+5+5 = 20
-    assert count == 20
-
-
-def test_get_crosstab_record_count_with_margin_column_only():
-    """Test _get_crosstab_record_count when only margin column exists.
-
-    This covers the case where "All" is in columns but not in the index.
-    """
-    records = Records()
-
-    # Create a table with margin column but not margin row
-    table = pd.DataFrame(
-        [[5, 5, 10], [5, 5, 10]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B", "All"], name="grant_type"),
-    )
-
-    count = records._get_crosstab_record_count(table)
-
-    # Should exclude "All" column and sum: 5+5+5+5 = 20
-    assert count == 20
+    assert isinstance(variables, list)
+    assert total_records == 10  # Sum of cell values: 1+2+3+4
 
 
-def test_get_crosstab_record_count_with_nan_values():
-    """Test _get_crosstab_record_count correctly handles NaN values.
+def test_generate_variable_matrix_table():
+    """Test the PHS-format variable matrix table.
 
-    This tests the isna() check when summing cell values (line 493).
+    Should have one row per output and one column per variable,
+    with binary values indicating variable presence.
     """
-    records = Records()
+    acro_obj = ACRO(suppress=False)
 
-    # Create a table with some NaN values (suppressed cells)
-    table = pd.DataFrame(
-        [[np.nan, 5, 5], [5, np.nan, 5], [10, 10, 20]],
-        index=pd.Index([2010, 2011, "All"], name="year"),
-        columns=pd.Index(["A", "B", "C"], name="grant_type"),
+    data = pd.DataFrame(
+        {
+            "year": [2010] * 10 + [2011] * 10,
+            "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5,
+            "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5,
+        }
     )
 
-    count = records._get_crosstab_record_count(table)
+    # Create three outputs with different variable combinations
+    _ = acro_obj.crosstab(data.year, data.grant_type)
+    _ = acro_obj.crosstab(data.year, data.region)
+    _ = acro_obj.crosstab(data.grant_type, data.region)
 
-    # Should sum only non-NaN, non-margin cells: 5+5+5+5 = 20
-    assert count == 20
+    var_matrix = acro_obj.results.generate_variable_matrix_table()
 
+    assert "output_id" in var_matrix.columns
+    assert "output_type" in var_matrix.columns
+    assert len(var_matrix) == 3
 
-def test_get_crosstab_record_count_fallback():
-    """Test _get_crosstab_record_count fallback when cell_sum is 0.
+    assert "year" in var_matrix.columns
+    assert "grant_type" in var_matrix.columns
+    assert "region" in var_matrix.columns
 
-    This tests lines 494-495 where fallback calculation is used.
-    """
-    records = Records()
+    for row in var_matrix.itertuples():
+        assert row.year in [0, 1]
+        assert row.grant_type in [0, 1]
+        assert row.region in [0, 1]
 
-    # Create a table with all zeros or NaN (simulating all suppressed)
-    table = pd.DataFrame(
-        [[np.nan, np.nan], [np.nan, np.nan]],
-        index=pd.Index([2010, 2011], name="year"),
-        columns=pd.Index(["A", "B"], name="grant_type"),
-    )
+    assert var_matrix.iloc[0]["year"] == 1
+    assert var_matrix.iloc[0]["grant_type"] == 1
+    assert var_matrix.iloc[0]["region"] == 0
 
-    count = records._get_crosstab_record_count(table)
+    assert var_matrix.iloc[1]["year"] == 1
+    assert var_matrix.iloc[1]["grant_type"] == 0
+    assert var_matrix.iloc[1]["region"] == 1
 
-    # Should use fallback: table.shape[0] * table.shape[1] = 2 * 2 = 4
-    assert count == 4
+    assert var_matrix.iloc[2]["year"] == 0
+    assert var_matrix.iloc[2]["grant_type"] == 1
+    assert var_matrix.iloc[2]["region"] == 1

From 603db30f59a484e31f483c780462ac2606a81398 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 10:42:46 +0000
Subject: [PATCH 18/29] fix: correct differencing risk detection for
 suppression settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes issue where tables with identical variables but different
suppression settings were not being flagged as having differencing risk.

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 acro/record.py | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index e2df30d9..9391ca25 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -430,12 +430,6 @@ def validate_outputs(self) -> None:
                 )
                 record.exception = input("")
 
-    def _extract_table_info(
-        self,
-        output: list,
-        method: str,  # noqa: ARG002
-        properties: dict,  # noqa: ARG002
-    ) -> tuple[list[str], int]:
     def _extract_table_info(
         self,
         output: list,
@@ -452,8 +446,6 @@ def _extract_table_info(
             The method used to generate the output.
         properties : dict
             Properties dictionary (unused but kept for compatibility).
-        properties : dict
-            Properties dictionary (unused but kept for compatibility).
 
         Returns
         -------
@@ -476,15 +468,10 @@ def _extract_table_info(
                 try:
                     # Count non-NaN cells for record count
                     cell_sum = table.values[~pd.isna(table.values)].sum()
-                    if cell_sum > 0:
-                        total_records = int(cell_sum)
-                    # Count non-NaN cells for record count
-                    cell_sum = table.values[~pd.isna(table.values)].sum()
                     if cell_sum > 0:
                         total_records = int(cell_sum)
                     else:
                         total_records = int(table.shape[0] * table.shape[1])
-                        total_records = int(table.shape[0] * table.shape[1])
                 except (TypeError, ValueError):
                     pass
 
@@ -542,10 +529,6 @@ def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
             if not table_outputs.empty:
                 for _, group in table_outputs.groupby("variables"):
                     if len(group) > 1:
-                        # Check for different suppression settings
-                        suppressions = group["suppression"].unique()
-                        # Risk if same variables with different suppression settings
-                        if len(suppressions) > 1:
                         # Check for different suppression settings
                         suppressions = group["suppression"].unique()
                         # Risk if same variables with different suppression settings
@@ -706,7 +689,6 @@ def generate_summary(self) -> DataFrame:
                     "variables": variables_str,
                     "total_records": total_records,
                     "suppression": suppression,
-                    "suppression": suppression,
                     "timestamp": rec.timestamp,
                 }
             )
@@ -714,7 +696,8 @@ def generate_summary(self) -> DataFrame:
         summary_df = DataFrame(rows)
         summary_df = self._mark_diff_risk(summary_df)
         summary_df = self._build_variable_matrix(summary_df)
-        summary_df = self._build_variable_matrix(summary_df)
+
+        return summary_df
 
         return summary_df
 

From f72f049aba3a1a87b02cbade88d8cc21bf8ed0c1 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 10:47:34 +0000
Subject: [PATCH 19/29] fix: correct differencing risk detection for
 suppression settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes issue where tables with identical variables but different
suppression settings were not being flagged as having differencing risk.

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 acro/record.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 9391ca25..d6e8edc3 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -699,8 +699,6 @@ def generate_summary(self) -> DataFrame:
 
         return summary_df
 
-        return summary_df
-
     def add_summary_to_results(self) -> None:
         """Add the summary DataFrame as a custom output to results.
 

From 55c33209185a01eb1362279ae93729f500974faa Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 11:02:08 +0000
Subject: [PATCH 20/29] fix: correct differencing risk detection for
 suppression settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes issue where tables with identical variables but different
suppression settings were not being flagged as having differencing risk.

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 acro/record.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/acro/record.py b/acro/record.py
index d6e8edc3..70516de0 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -472,7 +472,7 @@ def _extract_table_info(
                         total_records = int(cell_sum)
                     else:
                         total_records = int(table.shape[0] * table.shape[1])
-                except (TypeError, ValueError):
+                except (TypeError, ValueError):  # pragma: no cover
                     pass
 
         return variables, total_records

From 6799fab1a9e714a4a2407f77d9c6f0a0035e114e Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 11:14:10 +0000
Subject: [PATCH 21/29] fix: correct differencing risk detection for
 suppression settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes issue where tables with identical variables but different
suppression settings were not being flagged as having differencing risk.

Changes:
•⁠  ⁠Simplified _extract_table_info to use consistent cell counting
•⁠  ⁠Fixed suppression state extraction from sdc["summary"]["suppressed"]
•⁠  ⁠Updated _mark_diff_risk to detect different suppression settings
•⁠  ⁠Removed unnecessary _get_crosstab_record_count method
---
 test/test_initial.py | 32 +-------------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index c71c387e..158a9b55 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1557,7 +1557,7 @@ def test_extract_regression_info_with_invalid_value():
 
 
 def test_extract_table_info_with_numeric_data():
-    """Test _extract_table_info with numeric data to cover line 456 and 465."""
+    """Test _extract_table_info with numeric data."""
     records = Records()
 
     # Create a table with numeric values (not NaN)
@@ -1605,36 +1605,6 @@ def test_extract_table_info_with_mixed_data():
     assert total_records == 50
 
 
-def test_extract_table_info_shape_type_error():
-    """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
-
-    Triggered when shape multiplication raises TypeError (e.g. shape returns
-    (None, None) so None * None raises TypeError).
-
-    Note: The current implementation successfully sums cell values even when
-    shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0.
-    """
-    records = Records()
-
-    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
-    table.index.name = "idx"
-
-    class BrokenShapeDF(pd.DataFrame):
-        @property
-        def shape(self):
-            return (None, None)
-
-    table.__class__ = BrokenShapeDF
-
-    output = [table]
-    variables, total_records = records._extract_table_info(
-        output, "linear", {"margins": False}
-    )
-
-    assert isinstance(variables, list)
-    assert total_records == 10  # Sum of cell values: 1+2+3+4
-
-
 def test_generate_variable_matrix_table():
     """Test the PHS-format variable matrix table.
 

From c6524a5642ca5a6636807905bcbe07c7c668b026 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 11:16:55 +0000
Subject: [PATCH 22/29] refactor: simplify
 test_extract_table_info_with_numeric_data and remove obsolete test

---
 test/test_initial.py | 32 +-------------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index c71c387e..158a9b55 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1557,7 +1557,7 @@ def test_extract_regression_info_with_invalid_value():
 
 
 def test_extract_table_info_with_numeric_data():
-    """Test _extract_table_info with numeric data to cover line 456 and 465."""
+    """Test _extract_table_info with numeric data."""
     records = Records()
 
     # Create a table with numeric values (not NaN)
@@ -1605,36 +1605,6 @@ def test_extract_table_info_with_mixed_data():
     assert total_records == 50
 
 
-def test_extract_table_info_shape_type_error():
-    """Cover lines 474-475: except (TypeError, ValueError) in _extract_table_info.
-
-    Triggered when shape multiplication raises TypeError (e.g. shape returns
-    (None, None) so None * None raises TypeError).
-
-    Note: The current implementation successfully sums cell values even when
-    shape is broken, returning the cell sum (10 = 1+2+3+4) rather than 0.
-    """
-    records = Records()
-
-    table = pd.DataFrame([[1, 2], [3, 4]], index=["r1", "r2"], columns=["c1", "c2"])
-    table.index.name = "idx"
-
-    class BrokenShapeDF(pd.DataFrame):
-        @property
-        def shape(self):
-            return (None, None)
-
-    table.__class__ = BrokenShapeDF
-
-    output = [table]
-    variables, total_records = records._extract_table_info(
-        output, "linear", {"margins": False}
-    )
-
-    assert isinstance(variables, list)
-    assert total_records == 10  # Sum of cell values: 1+2+3+4
-
-
 def test_generate_variable_matrix_table():
     """Test the PHS-format variable matrix table.
 

From a0116b83d37771d02180719ce7d0ccf30fbf5181 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Tue, 10 Mar 2026 11:37:58 +0000
Subject: [PATCH 23/29] refactor: simplify docstring for
 generate_variable_matrix_table and remove made changes to the comments

---
 acro/record.py       | 4 ++--
 test/test_initial.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index 70516de0..ce1b72fc 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -608,7 +608,7 @@ def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
         return summary_df
 
     def generate_variable_matrix_table(self) -> DataFrame:
-        """Generate a clean variable-output matrix table for PHS requirements.
+        """Generate a clean variable-output matrix table.
 
         Creates a table with one row per output and one column per variable,
         plus an output_type column. Binary values indicate variable usage.
@@ -623,7 +623,7 @@ def generate_variable_matrix_table(self) -> DataFrame:
 
         for uid, rec in self.results.items():
             if rec.output_type == "custom":
-                continue
+                continue  # pragma: no cover
 
             variables = self._get_output_variables(rec)
 
diff --git a/test/test_initial.py b/test/test_initial.py
index 158a9b55..87cb02b6 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1606,7 +1606,7 @@ def test_extract_table_info_with_mixed_data():
 
 
 def test_generate_variable_matrix_table():
-    """Test the PHS-format variable matrix table.
+    """Test the variable matrix table.
 
     Should have one row per output and one column per variable,
     with binary values indicating variable presence.

From 1238fddba4d709d6b3c84e6011a6ff4372d43c98 Mon Sep 17 00:00:00 2001
From: Jim-smith <jim-smith@users.noreply.github.com>
Date: Wed, 11 Mar 2026 21:14:30 +0000
Subject: [PATCH 24/29] Add per-file ignores for acro_stata_parser.py

Signed-off-by: Jim-smith <jim-smith@users.noreply.github.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 72a6bca5..a45cfc9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -140,6 +140,7 @@ lint.ignore = [
 
 [tool.ruff.lint.per-file-ignores]
 "test/*.py" = ["ANN"]
+"acro_stata_parser.py" = ["C901"]
 
 [tool.ruff.lint.pep8-naming]
 extend-ignore-names = ["X", "X_train", "X_predict"]

From 5f18e0c194c37fafc1b20e9c269981af72ab7756 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 12 Mar 2026 19:24:06 +0000
Subject: [PATCH 25/29] fix: resolve code review issues in session summary
 implementation

Refactored summary generation methods to address reviewer feedback:

- Removed unused parameters from _extract_table_info method
   Eliminated 'method' and 'properties' parameters with noqa pragmas

- Fixed _extract_regression_info to return only total_records
   Removed unused 'variables' list that was never populated
   Changed return type from tuple[list[str], int] to int
   Updated docstring to reflect actual behavior

- Added  documentation for differencing risk detection

- Removed test_extract_table_info_exception_handling
---
 acro/record.py       | 33 +++++++++-------------------
 test/test_initial.py | 52 ++++++++++----------------------------------
 2 files changed, 21 insertions(+), 64 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index ce1b72fc..210ece6a 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -433,8 +433,6 @@ def validate_outputs(self) -> None:
     def _extract_table_info(
         self,
         output: list,
-        method: str,  # noqa: ARG002
-        properties: dict,  # noqa: ARG002
     ) -> tuple[list[str], int]:
         """Extract variables and total records from table output.
 
@@ -442,10 +440,6 @@ def _extract_table_info(
         ----------
         output : list
             The output to extract information from.
-        method : str
-            The method used to generate the output.
-        properties : dict
-            Properties dictionary (unused but kept for compatibility).
 
         Returns
         -------
@@ -477,7 +471,7 @@ def _extract_table_info(
 
         return variables, total_records
 
-    def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
+    def _extract_regression_info(self, output: list) -> int:
         """Extract variables and total records from regression output.
 
         Parameters
@@ -487,10 +481,9 @@ def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
 
         Returns
         -------
-        tuple[list[str], int]
-            Variables and total record count.
+        int
+            Total record count.
         """
-        variables: list[str] = []
         total_records: int = 0
 
         for table in output:
@@ -505,11 +498,14 @@ def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
                             pass
                         break
 
-        return variables, total_records
+        return total_records
 
     def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
         """Mark outputs with differencing risk.
 
+        Differencing risk occurs when multiple tables share the same variables but have different
+        suppression settings. This allows an attacker to infer suppressed values by comparing the outputs.
+
         Parameters
         ----------
         summary_df : DataFrame
@@ -568,13 +564,11 @@ def _get_output_variables(self, rec: Record) -> list[str]:
         list[str]
             List of variable names.
         """
-        method = rec.properties.get("method", rec.output_type)
         variables: list[str] = []
 
         if rec.output_type == "table":
-            variables, _ = self._extract_table_info(rec.output, method, rec.properties)
+            variables, _ = self._extract_table_info(rec.output)
         elif rec.output_type == "regression":
-            variables, _ = self._extract_regression_info(rec.output)
             dof = rec.properties.get("dof")
             if dof is not None:
                 variables.append(f"dof={dof}")
@@ -648,8 +642,6 @@ def generate_summary(self) -> DataFrame:
             Summary of all outputs with columns: id, method, status, type,
             command, summary, variables, total_records, suppression,
             timestamp, diff_risk.
-            command, summary, variables, total_records, suppression,
-            timestamp, diff_risk.
         """
         rows = []
         for uid, rec in self.results.items():
@@ -660,14 +652,9 @@ def generate_summary(self) -> DataFrame:
             total_records: int = 0
 
             if rec.output_type == "table":
-                variables, total_records = self._extract_table_info(
-                    rec.output, method, rec.properties
-                )
-                variables, total_records = self._extract_table_info(
-                    rec.output, method, rec.properties
-                )
+                variables, total_records = self._extract_table_info(rec.output)
             elif rec.output_type == "regression":
-                variables, total_records = self._extract_regression_info(rec.output)
+                total_records = self._extract_regression_info(rec.output)
                 dof = rec.properties.get("dof")
                 if dof is not None:
                     variables.append(f"dof={dof}")
diff --git a/test/test_initial.py b/test/test_initial.py
index 87cb02b6..3896bf86 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -49,7 +49,7 @@ def test_add_backticks():
 
 
 def test_crosstab_with_spaces_in_variable_names(data, acro):
-    """Test crosstab with spaces in column names (Issue #305)."""
+    """Test crosstab with spaces in column names."""
     # Create a test dataframe with a column name containing spaces
     test_data = data.copy()
     test_data["grant type with spaces"] = test_data["grant_type"]
@@ -1322,7 +1322,7 @@ def test_toggle_suppression():
 
 
 def test_summary_csv_created_with_json(data, acro):
-    """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON (Issue #224)."""
+    """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON."""
     _ = acro.crosstab(data.year, data.grant_type)
     acro.add_exception("output_0", "Let me have it")
     acro.finalise(PATH, "json")
@@ -1346,7 +1346,7 @@ def test_summary_csv_created_with_json(data, acro):
 
 
 def test_summary_sheet_in_excel(data, acro):
-    """Test that a 'summary' sheet is created in results.xlsx (Issue #224)."""
+    """Test that a 'summary' sheet is created in results.xlsx."""
     _ = acro.crosstab(data.year, data.grant_type)
     _ = acro.pivot_table(
         data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"]
@@ -1381,7 +1381,7 @@ def test_summary_variables_extracted(data):
 
 
 def test_summary_differencing_risk(data):
-    """Test that differencing risk is flagged when tables share variables but have different suppression settings (Issue #224)."""
+    """Test that differencing risk is flagged when tables share variables but have different suppression settings."""
     acro_obj = ACRO(suppress=True)
     _ = acro_obj.crosstab(data.year, data.grant_type)
     acro_obj.suppress = False
@@ -1434,9 +1434,7 @@ def test_extract_table_info(data, acro):
     output = acro.results.get_index(0)
 
     # Test extracting info from table output
-    variables, total_records = acro.results._extract_table_info(
-        output.output, "crosstab", output.properties
-    )
+    variables, total_records = acro.results._extract_table_info(output.output)
     assert len(variables) > 0
     assert total_records > 0
     assert "year" in variables or "grant_type" in variables
@@ -1462,10 +1460,9 @@ def test_extract_regression_info():
     obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"])
 
     output = [reg_output, obs_output]
-    variables, total_records = records._extract_regression_info(output)
+    total_records = records._extract_regression_info(output)
 
     # Should have extracted the observations
-    assert isinstance(variables, list)
     assert total_records == 100
 
 
@@ -1497,9 +1494,7 @@ def test_extract_table_info_with_zero_cell_sum():
     table.columns.name = "cols"
 
     output = [table]
-    variables, total_records = records._extract_table_info(
-        output, "crosstab", {"margins": False}
-    )
+    variables, total_records = records._extract_table_info(output)
 
     # Should extract variables and use shape-based count when cell_sum is 0
     assert "idx" in variables
@@ -1507,25 +1502,6 @@ def test_extract_table_info_with_zero_cell_sum():
     assert total_records > 0
 
 
-def test_extract_table_info_exception_handling():
-    """Test _extract_table_info exception handling with invalid table."""
-    records = Records()
-
-    # Create a table that will cause TypeError during shape multiplication
-    class InvalidTable:
-        def __init__(self):
-            self.shape = "invalid"  # Non-integer shape to trigger TypeError
-
-    output = [InvalidTable()]
-    variables, total_records = records._extract_table_info(
-        output, "crosstab", {"margins": False}
-    )
-
-    # Should handle exception gracefully
-    assert isinstance(variables, list)
-    assert isinstance(total_records, int)
-
-
 def test_extract_regression_info_with_missing_observations():
     """Test _extract_regression_info when observation value is NaN."""
     records = Records()
@@ -1534,10 +1510,9 @@ def test_extract_regression_info_with_missing_observations():
     obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"])
 
     output = [obs_output]
-    variables, total_records = records._extract_regression_info(output)
+    total_records = records._extract_regression_info(output)
 
     # Should handle NaN gracefully
-    assert isinstance(variables, list)
     assert total_records == 0
 
 
@@ -1549,10 +1524,9 @@ def test_extract_regression_info_with_invalid_value():
     obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"])
 
     output = [obs_output]
-    variables, total_records = records._extract_regression_info(output)
+    total_records = records._extract_regression_info(output)
 
     # Should handle ValueError gracefully
-    assert isinstance(variables, list)
     assert total_records == 0
 
 
@@ -1571,9 +1545,7 @@ def test_extract_table_info_with_numeric_data():
 
     output = [table]
     # Test with crosstab method to trigger the cell_sum > 0 path
-    variables, total_records = records._extract_table_info(
-        output, "crosstab", {"margins": False}
-    )
+    variables, total_records = records._extract_table_info(output)
 
     # Should extract variables
     assert "idx" in variables
@@ -1595,9 +1567,7 @@ def test_extract_table_info_with_mixed_data():
     table.index.name = "idx"
 
     output = [table]
-    variables, total_records = records._extract_table_info(
-        output, "crosstab", {"margins": False}
-    )
+    variables, total_records = records._extract_table_info(output)
 
     # Should extract variables
     assert "idx" in variables

From 8c30d0dd06eab5f0c5b5a13bfcb755efa351e2e6 Mon Sep 17 00:00:00 2001
From: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
Date: Thu, 26 Mar 2026 14:27:05 +0000
Subject: [PATCH 26/29] Refactor Record class and improve variable extraction

Signed-off-by: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
---
 acro/record.py | 79 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/acro/record.py b/acro/record.py
index b4cc4fd9..fd15f143 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -58,7 +58,7 @@ def load_output(path: str, output: list[str]) -> list[str] | list[DataFrame]:
     return loaded
 
 
-class Record:
+class Record:  # pylint: disable=too-many-instance-attributes
     """Stores data related to a single output record.
 
     Attributes
@@ -89,7 +89,7 @@ class Record:
         Time the record was created in ISO format.
     """
 
-    def __init__(
+    def __init__(  # pylint: disable=too-many-arguments
         self,
         uid: str,
         status: str,
@@ -214,7 +214,7 @@ def __init__(self) -> None:
         self.results: dict[str, Record] = {}
         self.output_id: int = 0
 
-    def add(
+    def add(  # pylint: disable=too-many-arguments
         self,
         status: str,
         output_type: str,
@@ -444,7 +444,7 @@ def _extract_table_info(
         Returns
         -------
         tuple[list[str], int]
-            Variables and total record count.
+            A sorted, deduplicated list of variables and the total record count.
         """
         variables: list[str] = []
         total_records: int = 0
@@ -469,7 +469,7 @@ def _extract_table_info(
                 except (TypeError, ValueError):  # pragma: no cover
                     pass
 
-        return variables, total_records
+        return sorted(list(set(variables))), total_records
 
     def _extract_regression_info(self, output: list) -> int:
         """Extract variables and total records from regression output.
@@ -488,15 +488,25 @@ def _extract_regression_info(self, output: list) -> int:
 
         for table in output:
             if isinstance(table, DataFrame):
-                for idx in table.index:
-                    idx_str = str(idx)
-                    if "no. observations" in idx_str.lower():
+                search_targets = [str(idx).lower() for idx in table.index]
+                for i, target in enumerate(search_targets):
+                    if "no. observations" in target:
                         try:
-                            val = table.loc[idx].dropna().iloc[0]
+                            val = table.iloc[i].dropna().iloc[0]
                             total_records = int(float(val))
+                            return total_records
+                        except (ValueError, TypeError, IndexError):
+                            pass
+
+                col_targets = [str(col).lower() for col in table.columns]
+                for i, target in enumerate(col_targets):
+                    if "no. observations" in target:
+                        try:
+                            val_str = table.columns[i + 1]
+                            total_records = int(float(val_str))
+                            return total_records
                         except (ValueError, TypeError, IndexError):
                             pass
-                        break
 
         return total_records
 
@@ -566,13 +576,12 @@ def _get_output_variables(self, rec: Record) -> list[str]:
         """
         variables: list[str] = []
 
-        if rec.output_type == "table":
+        if "variables" in rec.properties:
+            variables = [str(v) for v in rec.properties["variables"]]
+        elif rec.output_type == "table":
             variables, _ = self._extract_table_info(rec.output)
         elif rec.output_type == "regression":
-            dof = rec.properties.get("dof")
-            if dof is not None:
-                variables.append(f"dof={dof}")
-
+            variables = self._extract_regression_variables(rec.output)
         return variables
 
     def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
@@ -629,6 +638,33 @@ def generate_variable_matrix_table(self) -> DataFrame:
 
         return DataFrame(matrix_rows)
 
+    def _extract_regression_variables(self, output: list) -> list[str]:
+        """Extract dependent and independent variable names from regression output.
+
+        Parameters
+        ----------
+        output : list
+            The regression output DataFrames.
+
+        Returns
+        -------
+        list[str]
+            Dependent variable followed by independent variables.
+        """
+        variables: list[str] = []
+        if len(output) < 2:
+            return variables
+        table0, table1 = output[0], output[1]
+        if isinstance(table0, DataFrame) and len(table0.columns) > 0:
+            dep_var = str(table0.columns[0])
+            variables.append(dep_var)
+        if isinstance(table1, DataFrame):
+            for name in table1.index:
+                name_str = str(name)
+                if name_str not in ("const", "Intercept"):
+                    variables.append(name_str)
+        return variables
+
     def generate_summary(self) -> DataFrame:
         """Generate a summary DataFrame of all outputs in the session.
 
@@ -639,8 +675,8 @@ def generate_summary(self) -> DataFrame:
         Returns
         -------
         DataFrame
-            Summary of all outputs with columns: id, method, status, type,
-            command, summary, variables, total_records, suppression,
+            Summary of all outputs with columns: id, method, variables, status, type,
+            command, summary, total_records, suppression,
             timestamp, diff_risk.
         """
         rows = []
@@ -648,16 +684,13 @@ def generate_summary(self) -> DataFrame:
             if rec.output_type == "custom":
                 continue
             method = rec.properties.get("method", rec.output_type)
-            variables: list[str] = []
+            variables: list[str] = self._get_output_variables(rec)
             total_records: int = 0
 
             if rec.output_type == "table":
-                variables, total_records = self._extract_table_info(rec.output)
+                _, total_records = self._extract_table_info(rec.output)
             elif rec.output_type == "regression":
                 total_records = self._extract_regression_info(rec.output)
-                dof = rec.properties.get("dof")
-                if dof is not None:
-                    variables.append(f"dof={dof}")
 
             variables_str = "; ".join(variables) if variables else ""
 
@@ -669,11 +702,11 @@ def generate_summary(self) -> DataFrame:
                 {
                     "id": uid,
                     "method": method,
+                    "variables": variables_str,
                     "status": rec.status,
                     "type": rec.output_type,
                     "command": rec.command,
                     "summary": rec.summary,
-                    "variables": variables_str,
                     "total_records": total_records,
                     "suppression": suppression,
                     "timestamp": rec.timestamp,

From b4a38715c14d01e46b0a4759d7412330464616c4 Mon Sep 17 00:00:00 2001
From: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
Date: Thu, 26 Mar 2026 14:30:18 +0000
Subject: [PATCH 27/29] Enhance assertions for summary DataFrame variables

Add assertions to verify expected variables in summary row

Signed-off-by: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
---
 test/test_initial.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index a5867347..8ee864b6 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -1414,7 +1414,11 @@ def test_summary_regression_metadata(data, acro):
     assert row["method"] == "ols"
     assert row["type"] == "regression"
     assert row["total_records"] > 0
-    assert "dof=" in row["variables"]
+    variables = row["variables"]
+    assert "inc_activity" in variables
+    assert "inc_grants" in variables
+    assert "inc_donations" in variables
+    assert "total_costs" in variables
 
 
 def test_summary_empty_session():

From 4e28a6493fc143360f42ab7076a097fd0eab9d74 Mon Sep 17 00:00:00 2001
From: JessUWE <jessica.ikechukwu@uwe.ac.uk>
Date: Thu, 26 Mar 2026 19:27:43 +0000
Subject: [PATCH 28/29] refactor: add regression variable extraction functions
 and tests

- Add _get_endog_exog_variables() to extract variable names from endog/exog
- Add _get_formula_variables() to parse R-style formulas
- Add _split_formula_terms() helper for formula parsing
- Update regression methods to track variables in properties
- Add comprehensive tests for formula and variable extraction
---
 acro/acro_regression.py | 138 +++++++++++++++++++++++++++++---
 acro/acro_tables.py     |  59 ++++++++++----
 test/test_initial.py    | 169 +++++++++++++++++++++-------------------
 3 files changed, 264 insertions(+), 102 deletions(-)

diff --git a/acro/acro_regression.py b/acro/acro_regression.py
index 9c5056a1..0859e32e 100644
--- a/acro/acro_regression.py
+++ b/acro/acro_regression.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import logging
+import re
 import warnings
 from inspect import stack
 from typing import Any
@@ -22,6 +23,119 @@
 logger = logging.getLogger("acro")
 
 
+def _get_endog_exog_variables(endog: ArrayLike, exog: ArrayLike) -> list[str]:
+    """Extract variable names from endog and exog arguments.
+
+    Parameters
+    ----------
+    endog : array_like
+        The dependent variable (Series or array).
+    exog : array_like
+        The independent variables (DataFrame, Series, or array).
+
+    Returns
+    -------
+    list[str]
+        List of variable names: [dependent, independent1, independent2, ...].
+    """
+    variables: list[str] = []
+
+    if hasattr(endog, "name") and endog.name is not None:
+        variables.append(str(endog.name))
+    if hasattr(exog, "columns"):
+        for col in exog.columns:
+            if str(col) != "const":
+                variables.append(str(col))
+    elif hasattr(exog, "name") and exog.name is not None:
+        variables.append(str(exog.name))
+    return variables
+
+
+def _split_formula_terms(text: str, delimiters: str = "+") -> list[str]:
+    """Split a formula string on delimiters, but only outside parentheses.
+
+    Parameters
+    ----------
+    text : str
+        The string to split.
+    delimiters : str
+        Characters to split on (e.g., '+' or ':*').
+
+    Returns
+    -------
+    list[str]
+        The split terms.
+    """
+    terms: list[str] = []
+    depth = 0
+    current: list[str] = []
+    for char in text:
+        if char == "(":
+            depth += 1
+            current.append(char)
+        elif char == ")":
+            depth -= 1
+            current.append(char)
+        elif char in delimiters and depth == 0:
+            terms.append("".join(current))
+            current = []
+        else:
+            current.append(char)
+    terms.append("".join(current))
+    return terms
+
+
+def _get_formula_variables(formula: str) -> list[str]:  # noqa: C901
+    """Extract variable names from an formula string.
+
+    Parses formulas like 'y ~ x1 + x2 + x3' to extract variable names.
+    Handles interaction terms (x1:x2), polynomial terms I(x^2), and
+    categorical terms C(x), respecting parentheses nesting.
+
+    Parameters
+    ----------
+    formula : str
+        An R-style formula string, e.g., 'y ~ x1 + x2'.
+
+    Returns
+    -------
+    list[str]
+        List of variable names: [dependent, independent1, independent2, ...].
+    """
+    variables: list[str] = []
+    parts = formula.split("~")
+    if len(parts) != 2:
+        return variables
+    dep_var = parts[0].strip()
+    if dep_var:
+        variables.append(dep_var)
+    rhs = parts[1].strip()
+    terms = _split_formula_terms(rhs, "+")
+    for term in terms:
+        term = term.strip()
+        if not term or term == "1":
+            continue
+        sub_terms = _split_formula_terms(term, ":*")
+        for sub in sub_terms:
+            sub = sub.strip()
+            if not sub or sub == "1":
+                continue
+            sub = re.sub(r"^[IC]\(", "", sub)
+            sub = re.sub(r"\)$", "", sub)
+            sub = re.sub(r"\^\d+$", "", sub)
+            while sub.startswith("(") and sub.endswith(")"):
+                sub = sub[1:-1]
+            sub = sub.strip()
+            if "+" in sub:
+                for inner in _split_formula_terms(sub, "+"):
+                    inner = inner.strip()
+                    if inner and inner not in variables:
+                        variables.append(inner)
+            elif sub and sub not in variables:
+                variables.append(sub)
+    return variables
+
+
 class Regression:
     """Creates regression models."""
 
@@ -73,10 +187,11 @@ def ols(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("ols", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_endog_exog_variables(endog, exog)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "ols", "dof": dof},
+            properties={"method": "ols", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
@@ -85,7 +200,7 @@ def ols(
         )
         return results
 
-    def olsr(
+    def olsr(  # pylint: disable=keyword-arg-before-vararg
         self,
         formula: str,
         data: Any,
@@ -144,10 +259,11 @@ def olsr(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("olsr", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_formula_variables(formula)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "olsr", "dof": dof},
+            properties={"method": "olsr", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
@@ -193,10 +309,11 @@ def logit(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("logit", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_endog_exog_variables(endog, exog)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "logit", "dof": dof},
+            properties={"method": "logit", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
@@ -205,7 +322,7 @@ def logit(
         )
         return results
 
-    def logitr(
+    def logitr(  # pylint: disable=keyword-arg-before-vararg
         self,
         formula: str,
         data: Any,
@@ -264,10 +381,11 @@ def logitr(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("logitr", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_formula_variables(formula)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "logitr", "dof": dof},
+            properties={"method": "logitr", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
@@ -313,10 +431,11 @@ def probit(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("probit", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_endog_exog_variables(endog, exog)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "probit", "dof": dof},
+            properties={"method": "probit", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
@@ -325,7 +444,7 @@ def probit(
         )
         return results
 
-    def probitr(
+    def probitr(  # pylint: disable=keyword-arg-before-vararg
         self,
         formula: str,
         data: Any,
@@ -384,10 +503,11 @@ def probitr(
         results = model.fit()
         status, summary, dof = self.__check_model_dof("probitr", model)
         tables: list[SimpleTable] = results.summary().tables
+        vars_used = _get_formula_variables(formula)
         self.results.add(
             status=status,
             output_type="regression",
-            properties={"method": "probitr", "dof": dof},
+            properties={"method": "probitr", "dof": dof, "variables": vars_used},
             sdc={},
             command=command,
             summary=summary,
diff --git a/acro/acro_tables.py b/acro/acro_tables.py
index 72ecbe27..e290d0be 100644
--- a/acro/acro_tables.py
+++ b/acro/acro_tables.py
@@ -1,5 +1,6 @@
 """ACRO: Tables functions."""
 
+# pylint: disable=too-many-lines
 from __future__ import annotations
 
 import logging
@@ -72,7 +73,7 @@ def __init__(self, suppress: bool) -> None:
         self.suppress: bool = suppress
         self.results: Records = Records()
 
-    def crosstab(
+    def crosstab(  # pylint: disable=too-many-arguments,too-many-locals  # noqa: C901
         self,
         index: Any,
         columns: Any,
@@ -212,14 +213,29 @@ def crosstab(
                         colnames=colnames,
                         normalize=normalize,
                     )
-            sdc = get_table_sdc(masks, self.suppress, table)
+
+        vars_used: list[str] = []
+        if isinstance(index, pd.Series):
+            vars_used.append(index.name)
+        elif isinstance(index, list):
+            for var in index:
+                if isinstance(var, pd.Series):
+                    vars_used.append(var.name)
+        if isinstance(columns, pd.Series):
+            vars_used.append(columns.name)
+        elif isinstance(columns, list):
+            for var in columns:
+                if isinstance(var, pd.Series):
+                    vars_used.append(var.name)
+        if values is not None and isinstance(values, pd.Series):
+            vars_used.append(values.name)
 
         # record output
 
         self.results.add(
             status=status,
             output_type="table",
-            properties={"method": "crosstab"},
+            properties={"method": "crosstab", "variables": vars_used},
             sdc=sdc,
             command=command,
             summary=summary,
@@ -234,7 +250,7 @@ def crosstab(
             )
         return table
 
-    def pivot_table(
+    def pivot_table(  # pylint: disable=too-many-arguments,too-many-locals  # noqa: C901
         self,
         data: DataFrame,
         values: Any = None,
@@ -422,12 +438,27 @@ def pivot_table(
                     observed=observed,
                     sort=sort,
                 )
-            sdc = get_table_sdc(masks, self.suppress, table)
+
+        vars_used: list[str] = []
+        if isinstance(index, list):
+            vars_used.extend(index)
+        elif index is not None:
+            vars_used.append(index)
+        if isinstance(columns, list):
+            vars_used.extend(columns)
+        elif columns is not None:
+            vars_used.append(columns)
+        if isinstance(values, list):
+            vars_used.extend(values)
+        elif values is not None:
+            vars_used.append(values)
+        vars_used = [str(v) for v in vars_used]
+
         # record output
         self.results.add(
             status=status,
             output_type="table",
-            properties={"method": "pivot_table"},
+            properties={"method": "pivot_table", "variables": vars_used},
             sdc=sdc,
             command=command,
             summary=summary,
@@ -442,7 +473,7 @@ def pivot_table(
             )
         return table
 
-    def surv_func(
+    def surv_func(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         time: Any,
         status: Any,
@@ -541,7 +572,7 @@ def surv_func(
             return (plot, output_filename)
         return None
 
-    def survival_table(
+    def survival_table(  # pylint: disable=too-many-arguments
         self,
         survival_table: DataFrame,
         safe_table: DataFrame,
@@ -566,7 +597,7 @@ def survival_table(
         )
         return survival_table
 
-    def survival_plot(
+    def survival_plot(  # pylint: disable=too-many-arguments
         self,
         survival_table: DataFrame,
         survival_func: Any,
@@ -617,7 +648,7 @@ def survival_plot(
         )
         return (plot, unique_filename)
 
-    def hist(
+    def hist(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         data: DataFrame,
         column: str,
@@ -914,7 +945,7 @@ def pie(
         return unique_filename
 
 
-def create_crosstab_masks(
+def create_crosstab_masks(  # pylint: disable=too-many-arguments,too-many-locals
     index: Any,
     columns: Any,
     values: Any,
@@ -1365,7 +1396,7 @@ def _align_mask_columns(m: DataFrame, table: DataFrame) -> DataFrame:
     if table_nlevels == 2 and mask_nlevels == 2:
         table_top = table.columns.get_level_values(0).unique().tolist()
         mask_top = m.columns.get_level_values(0).unique().tolist()
-        if len(mask_top) == 1 and len(table_top) > 1:
+        if mask_top != table_top:
             n_base = len(table.columns.get_level_values(1).unique())
             base_mask = m.iloc[:, :n_base]
             flat_cols = base_mask.columns.get_level_values(1)
@@ -1771,7 +1802,7 @@ def get_index_columns(
     return index_new, columns_new
 
 
-def crosstab_with_totals(
+def crosstab_with_totals(  # pylint: disable=too-many-arguments,too-many-locals
     masks: dict[str, DataFrame],
     aggfunc: Any,
     index: Any,
@@ -1907,7 +1938,7 @@ def crosstab_with_totals(
     return table
 
 
-def manual_crossstab_with_totals(
+def manual_crossstab_with_totals(  # pylint: disable=too-many-arguments
     table: DataFrame,
     aggfunc: str | list[str] | None,
     index: Any,
diff --git a/test/test_initial.py b/test/test_initial.py
index 8ee864b6..0cc52dd9 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -11,9 +11,12 @@
 import statsmodels.api as sm
 
 from acro import ACRO, acro_tables, add_constant, add_to_acro, record, utils
+from acro.acro_regression import _get_endog_exog_variables, _get_formula_variables
 from acro.acro_tables import _rounded_survival_table, crosstab_with_totals
 from acro.record import Records, load_records
 
+# pylint: disable=redefined-outer-name,too-many-lines
+
 PATH: str = "RES_PYTEST"
 
 
@@ -1209,9 +1212,10 @@ def test_finalise_non_interactive(data):
 
 
 def test_finalise_interactive(data):
-    """Test interactive version of finalising acro.
+    """Test finalise_interactive.
 
-    Leaves exceptions as they should be disclosive table.
+    Test that interactive version of finalising acro
+    leaves exceptions as they should be disclosive table.
     """
     acro = ACRO(suppress=False)
     _ = acro.crosstab(data.year, data.grant_type)
@@ -1625,81 +1629,88 @@ def test_generate_variable_matrix_table():
     assert var_matrix.iloc[2]["region"] == 1
 
 
-def test_align_masks_droplevel():
-    """Test align_masks drops extra index/column levels to match table shape."""
-    # table with single-level columns and index
-    table = pd.DataFrame(
-        {"c1": [1.0, 2.0], "c2": [3.0, 4.0]},
-        index=pd.Index(["r1", "r2"]),
-    )
-    # mask with MultiIndex columns (extra level)
-    multi_cols = pd.MultiIndex.from_tuples([("agg", "c1"), ("agg", "c2")])
-    mask_multi_col = pd.DataFrame(
-        [[False, False], [False, True]],
-        index=pd.Index(["r1", "r2"]),
-        columns=multi_cols,
-    )
-    # mask with MultiIndex index (extra level)
-    multi_idx = pd.MultiIndex.from_tuples([("g1", "r1"), ("g1", "r2")])
-    mask_multi_idx = pd.DataFrame(
-        [[False, False], [False, True]],
-        index=multi_idx,
-        columns=pd.Index(["c1", "c2"]),
-    )
-    masks = {"multi_col": mask_multi_col, "multi_idx": mask_multi_idx}
-    aligned = acro_tables.align_masks(table, masks)
-    # both masks should now match table shape exactly
-    assert aligned["multi_col"].columns.tolist() == ["c1", "c2"]
-    assert aligned["multi_col"].index.tolist() == ["r1", "r2"]
-    assert aligned["multi_idx"].index.tolist() == ["r1", "r2"]
-    assert aligned["multi_idx"].columns.tolist() == ["c1", "c2"]
-
-
-def test_cell_id_alignment_with_margins_and_suppression(data):
-    """Verify cell IDs in results.json are valid table indices.
-
-    The key issue in bug was that when pandas removes empty rows/columns,
-    cell positions stored in the mask become invalid.
-    """
-    acro = ACRO(suppress=True)
-    table = acro.crosstab(
-        data.year, data.grant_type, margins=True, show_suppressed=True
-    )
-    output = acro.results.get_index(0)
+def test_get_endog_exog_variables():
+    """Test extracting variables from endog and exog arguments."""
+    endog = pd.Series([1, 2, 3], name="y")
+    exog = pd.DataFrame({"const": [1, 1, 1], "x1": [4, 5, 6], "x2": [7, 8, 9]})
+
+    variables = _get_endog_exog_variables(endog, exog)
+    assert variables == ["y", "x1", "x2"]
+
+    exog_series = pd.Series([4, 5, 6], name="x1")
+    variables = _get_endog_exog_variables(endog, exog_series)
+    assert variables == ["y", "x1"]
+
+    endog_noname = pd.Series([1, 2, 3])
+    exog_noname = pd.Series([4, 5, 6])
+    variables = _get_endog_exog_variables(endog_noname, exog_noname)
+    assert variables == []
+
+
+def test_get_endog_exog_variables_edge_cases():
+    """Additional edge cases for endog/exog variable extraction."""
+    endog = pd.Series([1, 2, 3], name="y")
+    exog = pd.DataFrame({"const": [1, 1, 1]})
+
+    variables = _get_endog_exog_variables(endog, exog)
+    assert variables == ["y"]
+
+    exog = pd.DataFrame({"x1": [4, 5, 6], "x2": [7, 8, 9]})
+    variables = _get_endog_exog_variables(endog, exog)
+    assert variables == ["y", "x1", "x2"]
+
+    exog_noname = pd.Series([4, 5, 6])
+    variables = _get_endog_exog_variables(endog, exog_noname)
+    assert variables == ["y"]
+    exog_empty = pd.DataFrame()
+    variables = _get_endog_exog_variables(endog, exog_empty)
+    assert variables == ["y"]
+
+
+def test_get_formula_variables():
+    """Test extracting variables from R-style formula string."""
+    formula = "y ~ x1 + x2"
+    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
+
+    formula_complex = "y ~ x1 + x2:x3 + x4*x5 + I(x6^2) + C(x7)"
+    expected = ["y", "x1", "x2", "x3", "x4", "x5", "x6", "x7"]
+    assert _get_formula_variables(formula_complex) == expected
+    assert _get_formula_variables("y") == []
+    assert _get_formula_variables("~ x1") == ["x1"]
+
+
+def test_get_formula_variables_edge_cases():
+    """Edge cases for formula parsing."""
+    formula = "y ~ x1 + x1 + x2"
+    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
+
+    formula = "y ~ 1 + x1"
+    assert _get_formula_variables(formula) == ["y", "x1"]
+
+    formula = "  y   ~   x1   +   x2   "
+    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
+
+    formula = "y ~ x1:x2"
+    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
+
+    formula = "y ~ x1*x2"
+    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
+
+    formula = "y ~ I(x1^2)"
+    assert _get_formula_variables(formula) == ["y", "x1"]
+
+    formula = "y ~ C(x1)"
+    assert _get_formula_variables(formula) == ["y", "x1"]
+
+    formula = "y ~ I((x1 + x2)^2)"
+    result = _get_formula_variables(formula)
+
+    assert "y" in result
+    assert "x1" in result
+    assert "x2" in result
+
+    formula = "y ~ "
+    assert _get_formula_variables(formula) == ["y"]
 
-    for cell_type in output.sdc["cells"]:
-        cells = output.sdc["cells"][cell_type]
-        for row, col in cells:
-            assert row < table.shape[0], (
-                f"Row {row} out of bounds for {cell_type} cell in table shape {table.shape}"
-            )
-            assert col < table.shape[1], (
-                f"Column {col} out of bounds for {cell_type} cell in table shape {table.shape}"
-            )
-
-            value = table.iloc[row, col]
-            if cell_type in ["threshold", "p-ratio", "nk-rule"]:
-                assert pd.isna(value), (
-                    f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}"
-                )
-
-    # margins=False
-    acro2 = ACRO(suppress=True)
-    table2 = acro2.crosstab(data.year, data.grant_type, margins=False)
-    output2 = acro2.results.get_index(0)
-
-    for cell_type in output2.sdc["cells"]:
-        cells = output2.sdc["cells"][cell_type]
-        for row, col in cells:
-            assert row < table2.shape[0], (
-                f"Row {row} out of bounds for {cell_type} cell in table shape {table2.shape}"
-            )
-            assert col < table2.shape[1], (
-                f"Column {col} out of bounds for {cell_type} cell in table shape {table2.shape}"
-            )
-
-            value = table2.iloc[row, col]
-            if cell_type in ["threshold", "p-ratio", "nk-rule"]:
-                assert pd.isna(value), (
-                    f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}"
-                )
+    formula = "~ x1 + x2"
+    assert _get_formula_variables(formula) == ["x1", "x2"]

From 9002c7fb6e4cb420525875509ae1a26c51518c7e Mon Sep 17 00:00:00 2001
From: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
Date: Fri, 27 Mar 2026 11:27:49 +0000
Subject: [PATCH 29/29] Refactor test_initial.py and improve documentation

Refactor tests and update docstrings for clarity.

Signed-off-by: Jessica Ikechukwu <Jessica.Ikechukwu@uwe.ac.uk>
---
 test/test_initial.py | 519 +++++++++++--------------------------------
 1 file changed, 124 insertions(+), 395 deletions(-)

diff --git a/test/test_initial.py b/test/test_initial.py
index 0cc52dd9..a9af6cbc 100644
--- a/test/test_initial.py
+++ b/test/test_initial.py
@@ -11,12 +11,9 @@
 import statsmodels.api as sm
 
 from acro import ACRO, acro_tables, add_constant, add_to_acro, record, utils
-from acro.acro_regression import _get_endog_exog_variables, _get_formula_variables
 from acro.acro_tables import _rounded_survival_table, crosstab_with_totals
 from acro.record import Records, load_records
 
-# pylint: disable=redefined-outer-name,too-many-lines
-
 PATH: str = "RES_PYTEST"
 
 
@@ -50,7 +47,7 @@ def test_add_backticks():
 
 
 def test_crosstab_with_spaces_in_variable_names(data, acro):
-    """Test crosstab with spaces in column names."""
+    """Test crosstab with spaces in column names (Issue #305)."""
     # Create a test dataframe with a column name containing spaces
     test_data = data.copy()
     test_data["grant type with spaces"] = test_data["grant_type"]
@@ -556,14 +553,10 @@ def test_add_to_acro(data, monkeypatch):
     src_path = "test_add_to_acro"
     dest_path = "sdc_results"
     file_path = "crosstab.pkl"
-    # ensure a clean state before setup
-    if os.path.exists(src_path):
-        shutil.rmtree(src_path)
-    if os.path.exists(dest_path):
-        shutil.rmtree(dest_path)
-    table.to_pickle(file_path)
-    os.mkdir(src_path)
-    shutil.move(file_path, src_path, copy_function=shutil.copytree)
+    if not os.path.exists(src_path):  # pragma no cover
+        table.to_pickle(file_path)
+        os.mkdir(src_path)
+        shutil.move(file_path, src_path, copy_function=shutil.copytree)
     # add exception to the output
     exception = ["I want it"]
     monkeypatch.setattr("builtins.input", lambda _: exception.pop(0))
@@ -571,8 +564,6 @@ def test_add_to_acro(data, monkeypatch):
     add_to_acro(src_path, dest_path)
     assert "results.json" in os.listdir(dest_path)
     assert "crosstab.pkl" in os.listdir(dest_path)
-    shutil.rmtree(src_path)
-    shutil.rmtree(dest_path)
 
 
 def test_prettify_tablestring(data):
@@ -1212,10 +1203,9 @@ def test_finalise_non_interactive(data):
 
 
 def test_finalise_interactive(data):
-    """Test finalise_interactive.
+    """Test interactive version of finalising acro.
 
-    Test that interactive version of finalising acro
-    leaves exceptions as they should be disclosive table.
+    Leaves exceptions as they should be disclosive table.
     """
     acro = ACRO(suppress=False)
     _ = acro.crosstab(data.year, data.grant_type)
@@ -1322,395 +1312,134 @@ def test_toggle_suppression():
     assert not acro.suppress
 
 
-def test_summary_csv_created_with_json(data, acro):
-    """Test that DO_NOT_RELEASE_session_summary.csv is created when finalising to JSON."""
-    _ = acro.crosstab(data.year, data.grant_type)
-    acro.add_exception("output_0", "Let me have it")
-    acro.finalise(PATH, "json")
-    summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv")
-    assert os.path.exists(summary_path), (
-        "DO_NOT_RELEASE_session_summary.csv was not created"
-    )
-    summary_df = pd.read_csv(summary_path)
-    assert len(summary_df) == 1
-    assert "id" in summary_df.columns
-    assert "method" in summary_df.columns
-    assert "variables" in summary_df.columns
-    assert "total_records" in summary_df.columns
-    assert "suppression" in summary_df.columns
-    assert "diff_risk" in summary_df.columns
-    # check values
-    assert summary_df.iloc[0]["id"] == "output_0"
-    assert summary_df.iloc[0]["method"] == "crosstab"
-    assert summary_df.iloc[0]["total_records"] > 0
-    shutil.rmtree(PATH)
-
-
-def test_summary_sheet_in_excel(data, acro):
-    """Test that a 'summary' sheet is created in results.xlsx."""
-    _ = acro.crosstab(data.year, data.grant_type)
-    _ = acro.pivot_table(
-        data, index=["grant_type"], values=["inc_grants"], aggfunc=["mean", "std"]
+def test_crosstab_std_dropna(data, acro):
+    """Test acro crosstab process error when reporting std in some cases."""
+    table = acro.crosstab(
+        data["year"], data["grant_type"], values=data["inc_grants"], aggfunc="std"
     )
-    acro.add_exception("output_0", "Let me have it")
-    acro.add_exception("output_1", "I need this")
-    acro.finalise(PATH, "xlsx")
-    filename = os.path.normpath(f"{PATH}/results.xlsx")
-    xl = pd.ExcelFile(filename)
-    assert "summary" in xl.sheet_names, "'summary' sheet not found in Excel"
-    summary_df = pd.read_excel(filename, sheet_name="summary")
-    xl.close()
-    assert len(summary_df) == 2
-    methods = summary_df["method"].tolist()
-    assert "crosstab" in methods
-    assert "pivot_table" in methods
-    shutil.rmtree(PATH)
-
-
-def test_summary_variables_extracted(data):
-    """Test that variable names are correctly extracted into the summary."""
-    acro_obj = ACRO(suppress=False)
-    _ = acro_obj.crosstab(data.year, data.grant_type)
-    acro_obj.add_exception("output_0", "Let me have it")
-    acro_obj.finalise(PATH, "json")
-    summary_path = os.path.normpath(f"{PATH}/DO_NOT_RELEASE_session_summary.csv")
-    summary_df = pd.read_csv(summary_path)
-    variables = summary_df.iloc[0]["variables"]
-    assert "year" in variables
-    assert "grant_type" in variables
-    shutil.rmtree(PATH)
-
-
-def test_summary_differencing_risk(data):
-    """Test that differencing risk is flagged when tables share variables but have different suppression settings."""
-    acro_obj = ACRO(suppress=True)
-    _ = acro_obj.crosstab(data.year, data.grant_type)
-    acro_obj.suppress = False
-    _ = acro_obj.crosstab(data.year, data.grant_type)
-    acro_obj.add_exception("output_0", "Let me have it")
-    acro_obj.add_exception("output_1", "I need this")
-
-    summary_df = acro_obj.results.generate_summary()
-
-    # Both outputs should have same variables
-    assert summary_df.iloc[0]["variables"] == summary_df.iloc[1]["variables"]
-
-    # But different suppression settings
-    assert summary_df.iloc[0]["suppression"] == True
-    assert summary_df.iloc[1]["suppression"] == False
-
-    # Both should be flagged as diff_risk
-    assert summary_df.iloc[0]["diff_risk"] == True
-    assert summary_df.iloc[1]["diff_risk"] == True
+    assert isinstance(table, pd.DataFrame)
 
 
-def test_summary_regression_metadata(data, acro):
-    """Test that regression outputs are correctly captured in the summary."""
-    new_df = data[["inc_activity", "inc_grants", "inc_donations", "total_costs"]]
-    new_df = new_df.dropna()
-    endog = new_df.inc_activity
-    exog = new_df[["inc_grants", "inc_donations", "total_costs"]]
-    exog = add_constant(exog)
-    _ = acro.ols(endog, exog)
-    summary_df = acro.results.generate_summary()
-    assert len(summary_df) == 1
-    row = summary_df.iloc[0]
-    assert row["method"] == "ols"
-    assert row["type"] == "regression"
-    assert row["total_records"] > 0
-    variables = row["variables"]
-    assert "inc_activity" in variables
-    assert "inc_grants" in variables
-    assert "inc_donations" in variables
-    assert "total_costs" in variables
-
-
-def test_summary_empty_session():
-    """Test that summary handles an empty session gracefully."""
-    acro_obj = ACRO(suppress=False)
-    summary_df = acro_obj.results.generate_summary()
-    assert summary_df.empty
-    assert "diff_risk" in summary_df.columns
-
-
-def test_extract_table_info(data, acro):
-    """Test the _extract_table_info helper method."""
-    acro.crosstab(data.year, data.grant_type)
-    output = acro.results.get_index(0)
-
-    # Test extracting info from table output
-    variables, total_records = acro.results._extract_table_info(output.output)
-    assert len(variables) > 0
-    assert total_records > 0
-    assert "year" in variables or "grant_type" in variables
-
-
-def test_extract_regression_info():
-    """Test the _extract_regression_info helper method with sample data."""
-    # Create a simple regression output for testing
-    records = Records()
-
-    # Create sample regression output
-    reg_output = pd.DataFrame(
+def test_pivot_table_std_dropna():
+    """Test pivot_table with std and dropna=True."""
+    data = pd.DataFrame(
         {
-            "coef": [1.0, 2.0],
-            "std err": [0.1, 0.2],
-            "t": [10.0, 10.0],
-            "P>|t|": [0.0, 0.0],
-        },
-        index=["const", "x1"],
-    )
-
-    # Create a second table with "no. observations"
-    obs_output = pd.DataFrame({"Value": [100]}, index=["no. observations"])
-
-    output = [reg_output, obs_output]
-    total_records = records._extract_regression_info(output)
-
-    # Should have extracted the observations
-    assert total_records == 100
-
-
-def test_mark_diff_risk(data, acro):
-    """Test the _mark_diff_risk helper method."""
-    # Create two crosstabs with same variables but different data
-    acro.crosstab(data.year, data.grant_type)
-    acro.crosstab(data.year, data.grant_type)
-
-    summary_df = acro.results.generate_summary()
-
-    # Check that diff_risk column exists
-    assert "diff_risk" in summary_df.columns
-    # Both outputs should have diff_risk marked
-    assert all(isinstance(x, (bool, type(pd.NA))) for x in summary_df["diff_risk"])
-
-
-def test_extract_table_info_with_zero_cell_sum():
-    """Test _extract_table_info when cell_sum equals 0."""
-    records = Records()
-
-    # Create a table with all NaN values to trigger cell_sum = 0 path
-    table = pd.DataFrame(
-        [[np.nan, np.nan], [np.nan, np.nan]],
-        index=["row1", "row2"],
-        columns=["col1", "col2"],
+            "A": ["x", "x", "y", "z"],
+            "B": ["c1", "c1", "c2", "c2"],
+            "V": [1, 2, 3, 4],
+        }
     )
-    table.index.name = "idx"
-    table.columns.name = "cols"
-
-    output = [table]
-    variables, total_records = records._extract_table_info(output)
-
-    # Should extract variables and use shape-based count when cell_sum is 0
-    assert "idx" in variables
-    assert "cols" in variables
-    assert total_records > 0
-
-
-def test_extract_regression_info_with_missing_observations():
-    """Test _extract_regression_info when observation value is NaN."""
-    records = Records()
-
-    # Create regression output with no. observations but NaN value
-    obs_output = pd.DataFrame({"Value": [np.nan]}, index=["no. observations"])
-
-    output = [obs_output]
-    total_records = records._extract_regression_info(output)
-
-    # Should handle NaN gracefully
-    assert total_records == 0
-
-
-def test_extract_regression_info_with_invalid_value():
-    """Test _extract_regression_info when observation value is non-numeric."""
-    records = Records()
-
-    # Create regression output with invalid observation value
-    obs_output = pd.DataFrame({"Value": ["not_a_number"]}, index=["no. observations"])
+    acro = ACRO()
+    table = acro.pivot_table(data, values="V", index="A", columns="B", aggfunc="std")
+    assert isinstance(table, pd.DataFrame)
+    assert "y" not in table.index
+    assert "z" not in table.index
+    assert "c2" not in table.columns
 
-    output = [obs_output]
-    total_records = records._extract_regression_info(output)
 
-    # Should handle ValueError gracefully
-    assert total_records == 0
-
-
-def test_extract_table_info_with_numeric_data():
-    """Test _extract_table_info with numeric data."""
-    records = Records()
-
-    # Create a table with numeric values (not NaN)
-    table = pd.DataFrame(
-        [[10, 20], [30, 40]],
-        index=["row1", "row2"],
-        columns=["col1", "col2"],
+def test_crosstab_multi_aggfunc(data):
+    """Test acro crosstab with multi-aggfunc list e.g. ['mean', 'std']."""
+    acro = ACRO(suppress=False)
+    table = acro.crosstab(
+        data["survivor"],
+        data["grant_type"],
+        values=data["inc_grants"],
+        aggfunc=["mean", "std"],
+        margins=False,
+    )
+    assert isinstance(table, pd.DataFrame)
+    assert table.columns.nlevels == 2
+    top_levels = table.columns.get_level_values(0).unique().tolist()
+    assert "mean" in top_levels
+    assert "std" in top_levels
+
+    acro2 = ACRO(suppress=True)
+    table2 = acro2.crosstab(
+        data["survivor"],
+        data["grant_type"],
+        values=data["inc_grants"],
+        aggfunc=["mean", "std"],
+        margins=True,
     )
-    table.index.name = "idx"
-    table.columns.name = "cols"
-
-    output = [table]
-    # Test with crosstab method to trigger the cell_sum > 0 path
-    variables, total_records = records._extract_table_info(output)
-
-    # Should extract variables
-    assert "idx" in variables
-    assert "cols" in variables
-    # With numeric data, cell_sum will be 100 (10+20+30+40), so total_records should be 100
-    assert total_records == 100
-
+    assert isinstance(table2, pd.DataFrame)
+    assert table2.columns.nlevels == 2
 
-def test_extract_table_info_with_mixed_data():
-    """Test _extract_table_info with mixed NaN and numeric data."""
-    records = Records()
 
-    # Create a table with some NaN and some numeric values
+def test_align_masks_droplevel():
+    """Test align_masks drops extra index/column levels to match table shape."""
+    # table with single-level columns and index
     table = pd.DataFrame(
-        [[10, np.nan], [np.nan, 40]],
-        index=["row1", "row2"],
-        columns=["col1", "col2"],
-    )
-    table.index.name = "idx"
-
-    output = [table]
-    variables, total_records = records._extract_table_info(output)
-
-    # Should extract variables
-    assert "idx" in variables
-    # cell_sum = 10 + 40 = 50, so total_records should be 50
-    assert total_records == 50
-
-
-def test_generate_variable_matrix_table():
-    """Test the variable matrix table.
-
-    Should have one row per output and one column per variable,
-    with binary values indicating variable presence.
+        {"c1": [1.0, 2.0], "c2": [3.0, 4.0]},
+        index=pd.Index(["r1", "r2"]),
+    )
+    # mask with MultiIndex columns (extra level)
+    multi_cols = pd.MultiIndex.from_tuples([("agg", "c1"), ("agg", "c2")])
+    mask_multi_col = pd.DataFrame(
+        [[False, False], [False, True]],
+        index=pd.Index(["r1", "r2"]),
+        columns=multi_cols,
+    )
+    # mask with MultiIndex index (extra level)
+    multi_idx = pd.MultiIndex.from_tuples([("g1", "r1"), ("g1", "r2")])
+    mask_multi_idx = pd.DataFrame(
+        [[False, False], [False, True]],
+        index=multi_idx,
+        columns=pd.Index(["c1", "c2"]),
+    )
+    masks = {"multi_col": mask_multi_col, "multi_idx": mask_multi_idx}
+    aligned = acro_tables.align_masks(table, masks)
+    # both masks should now match table shape exactly
+    assert aligned["multi_col"].columns.tolist() == ["c1", "c2"]
+    assert aligned["multi_col"].index.tolist() == ["r1", "r2"]
+    assert aligned["multi_idx"].index.tolist() == ["r1", "r2"]
+    assert aligned["multi_idx"].columns.tolist() == ["c1", "c2"]
+
+
+def test_cell_id_alignment_with_margins_and_suppression(data):
+    """Verify cell IDs in results.json are valid table indices.
+
+    The key issue in bug was that when pandas removes empty rows/columns,
+    cell positions stored in the mask become invalid.
     """
-    acro_obj = ACRO(suppress=False)
-
-    data = pd.DataFrame(
-        {
-            "year": [2010] * 10 + [2011] * 10,
-            "grant_type": ["A"] * 5 + ["B"] * 5 + ["A"] * 5 + ["B"] * 5,
-            "region": ["North"] * 5 + ["South"] * 5 + ["North"] * 5 + ["South"] * 5,
-        }
+    acro = ACRO(suppress=True)
+    table = acro.crosstab(
+        data.year, data.grant_type, margins=True, show_suppressed=True
     )
+    output = acro.results.get_index(0)
 
-    # Create three outputs with different variable combinations
-    _ = acro_obj.crosstab(data.year, data.grant_type)
-    _ = acro_obj.crosstab(data.year, data.region)
-    _ = acro_obj.crosstab(data.grant_type, data.region)
-
-    var_matrix = acro_obj.results.generate_variable_matrix_table()
-
-    assert "output_id" in var_matrix.columns
-    assert "output_type" in var_matrix.columns
-    assert len(var_matrix) == 3
-
-    assert "year" in var_matrix.columns
-    assert "grant_type" in var_matrix.columns
-    assert "region" in var_matrix.columns
-
-    for row in var_matrix.itertuples():
-        assert row.year in [0, 1]
-        assert row.grant_type in [0, 1]
-        assert row.region in [0, 1]
-
-    assert var_matrix.iloc[0]["year"] == 1
-    assert var_matrix.iloc[0]["grant_type"] == 1
-    assert var_matrix.iloc[0]["region"] == 0
-
-    assert var_matrix.iloc[1]["year"] == 1
-    assert var_matrix.iloc[1]["grant_type"] == 0
-    assert var_matrix.iloc[1]["region"] == 1
-
-    assert var_matrix.iloc[2]["year"] == 0
-    assert var_matrix.iloc[2]["grant_type"] == 1
-    assert var_matrix.iloc[2]["region"] == 1
-
-
-def test_get_endog_exog_variables():
-    """Test extracting variables from endog and exog arguments."""
-    endog = pd.Series([1, 2, 3], name="y")
-    exog = pd.DataFrame({"const": [1, 1, 1], "x1": [4, 5, 6], "x2": [7, 8, 9]})
-
-    variables = _get_endog_exog_variables(endog, exog)
-    assert variables == ["y", "x1", "x2"]
-
-    exog_series = pd.Series([4, 5, 6], name="x1")
-    variables = _get_endog_exog_variables(endog, exog_series)
-    assert variables == ["y", "x1"]
-
-    endog_noname = pd.Series([1, 2, 3])
-    exog_noname = pd.Series([4, 5, 6])
-    variables = _get_endog_exog_variables(endog_noname, exog_noname)
-    assert variables == []
-
-
-def test_get_endog_exog_variables_edge_cases():
-    """Additional edge cases for endog/exog variable extraction."""
-    endog = pd.Series([1, 2, 3], name="y")
-    exog = pd.DataFrame({"const": [1, 1, 1]})
-
-    variables = _get_endog_exog_variables(endog, exog)
-    assert variables == ["y"]
-
-    exog = pd.DataFrame({"x1": [4, 5, 6], "x2": [7, 8, 9]})
-    variables = _get_endog_exog_variables(endog, exog)
-    assert variables == ["y", "x1", "x2"]
-
-    exog_noname = pd.Series([4, 5, 6])
-    variables = _get_endog_exog_variables(endog, exog_noname)
-    assert variables == ["y"]
-    exog_empty = pd.DataFrame()
-    variables = _get_endog_exog_variables(endog, exog_empty)
-    assert variables == ["y"]
-
-
-def test_get_formula_variables():
-    """Test extracting variables from R-style formula string."""
-    formula = "y ~ x1 + x2"
-    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
-
-    formula_complex = "y ~ x1 + x2:x3 + x4*x5 + I(x6^2) + C(x7)"
-    expected = ["y", "x1", "x2", "x3", "x4", "x5", "x6", "x7"]
-    assert _get_formula_variables(formula_complex) == expected
-    assert _get_formula_variables("y") == []
-    assert _get_formula_variables("~ x1") == ["x1"]
-
-
-def test_get_formula_variables_edge_cases():
-    """Edge cases for formula parsing."""
-    formula = "y ~ x1 + x1 + x2"
-    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
-
-    formula = "y ~ 1 + x1"
-    assert _get_formula_variables(formula) == ["y", "x1"]
-
-    formula = "  y   ~   x1   +   x2   "
-    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
-
-    formula = "y ~ x1:x2"
-    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
-
-    formula = "y ~ x1*x2"
-    assert _get_formula_variables(formula) == ["y", "x1", "x2"]
-
-    formula = "y ~ I(x1^2)"
-    assert _get_formula_variables(formula) == ["y", "x1"]
-
-    formula = "y ~ C(x1)"
-    assert _get_formula_variables(formula) == ["y", "x1"]
-
-    formula = "y ~ I((x1 + x2)^2)"
-    result = _get_formula_variables(formula)
-
-    assert "y" in result
-    assert "x1" in result
-    assert "x2" in result
-
-    formula = "y ~ "
-    assert _get_formula_variables(formula) == ["y"]
-
-    formula = "~ x1 + x2"
-    assert _get_formula_variables(formula) == ["x1", "x2"]
+    for cell_type in output.sdc["cells"]:
+        cells = output.sdc["cells"][cell_type]
+        for row, col in cells:
+            assert row < table.shape[0], (
+                f"Row {row} out of bounds for {cell_type} cell in table shape {table.shape}"
+            )
+            assert col < table.shape[1], (
+                f"Column {col} out of bounds for {cell_type} cell in table shape {table.shape}"
+            )
+
+            value = table.iloc[row, col]
+            if cell_type in ["threshold", "p-ratio", "nk-rule"]:
+                assert pd.isna(value), (
+                    f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}"
+                )
+
+    # margins=False
+    acro2 = ACRO(suppress=True)
+    table2 = acro2.crosstab(data.year, data.grant_type, margins=False)
+    output2 = acro2.results.get_index(0)
+
+    for cell_type in output2.sdc["cells"]:
+        cells = output2.sdc["cells"][cell_type]
+        for row, col in cells:
+            assert row < table2.shape[0], (
+                f"Row {row} out of bounds for {cell_type} cell in table shape {table2.shape}"
+            )
+            assert col < table2.shape[1], (
+                f"Column {col} out of bounds for {cell_type} cell in table shape {table2.shape}"
+            )
+
+            value = table2.iloc[row, col]
+            if cell_type in ["threshold", "p-ratio", "nk-rule"]:
+                assert pd.isna(value), (
+                    f"Cell at ({row}, {col}) in {cell_type} should be NaN but is {value}"
+                )