Skip to content
Closed
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
86b2eb2
feat: add summary generation and export functionality for results
JessUWE Feb 19, 2026
ea0a63e
test: add coverage tests for table info extraction
JessUWE Feb 19, 2026
f3c6d96
fix coverage
JessUWE Feb 19, 2026
f9ef74e
test: add coverage for empty summary edge case (line 595)
JessUWE Feb 19, 2026
47359b2
test: remove unused variable idx_type
JessUWE Feb 19, 2026
f817df2
test: fix unused variable warnings by using underscore
JessUWE Feb 19, 2026
e8bc926
feat: implement session summary with differencing risk detection
JessUWE Mar 5, 2026
e3665ff
feat(record): update warning message
JessUWE Mar 5, 2026
5f14a04
Merge branch 'main' into feature/224-session-summary
JessUWE Mar 5, 2026
404bba3
Add generate_summary() to provide high-level output overview for chec…
JessUWE Mar 5, 2026
fcbe9fa
- Add multi-layered 'DO NOT RELEASE' warnings (filename, comment)
JessUWE Mar 5, 2026
4c16dcd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
e1271ae
Remove tests for index and columns name extraction
JessUWE Mar 6, 2026
6a96fd2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2026
c2cd7b6
refactor: remove unreachable elif branches and unnecessary tests
JessUWE Mar 6, 2026
ee46528
Fixes issue where tables with identical variables but different
JessUWE Mar 10, 2026
0202e39
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 10, 2026
61ab307
Fixes issue where tables with identical variables but different
JessUWE Mar 10, 2026
f7c8937
Merge branch 'feature/224-session-summary' of https://github.com/AI-S…
JessUWE Mar 10, 2026
603db30
fix: correct differencing risk detection for suppression settings
JessUWE Mar 10, 2026
f72f049
fix: correct differencing risk detection for suppression settings
JessUWE Mar 10, 2026
55c3320
fix: correct differencing risk detection for suppression settings
JessUWE Mar 10, 2026
6799fab
fix: correct differencing risk detection for suppression settings
JessUWE Mar 10, 2026
c6524a5
refactor: simplify test_extract_table_info_with_numeric_data and remo…
JessUWE Mar 10, 2026
7234259
Merge branch 'feature/224-session-summary' of https://github.com/AI-S…
JessUWE Mar 10, 2026
a0116b8
refactor: simplify docstring for generate_variable_matrix_table and r…
JessUWE Mar 10, 2026
1238fdd
Add per-file ignores for acro_stata_parser.py
jim-smith Mar 11, 2026
5f18e0c
fix: resolve code review issues in session summary implementation
JessUWE Mar 12, 2026
10b8ccd
Merge main branch into feature/224-session-summary
JessUWE Mar 26, 2026
8c30d0d
Refactor Record class and improve variable extraction
JessUWE Mar 26, 2026
b4a3871
Enhance assertions for summary DataFrame variables
JessUWE Mar 26, 2026
4e28a64
refactor: add regression variable extraction functions and tests
JessUWE Mar 26, 2026
9002c7f
Refactor test_initial.py and improve documentation
JessUWE Mar 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 313 additions & 1 deletion acro/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,298 @@ def validate_outputs(self) -> None:
)
record.exception = input("")

def _extract_table_info(
self,
output: list,
method: str, # noqa: ARG002
properties: dict, # noqa: ARG002
) -> tuple[list[str], int]:
"""Extract variables and total records from table output.

Parameters
----------
output : list
The output to extract information from.
method : str
The method used to generate the output.
properties : dict
Properties dictionary (unused but kept for compatibility).

Returns
-------
tuple[list[str], int]
Variables and total record count.
"""
variables: list[str] = []
total_records: int = 0

for table in output:
if isinstance(table, DataFrame):
if hasattr(table.index, "names") and any(table.index.names):
variables.extend(str(n) for n in table.index.names if n is not None)

if hasattr(table.columns, "names") and any(table.columns.names):
variables.extend(
str(n) for n in table.columns.names if n is not None
)

try:
# Count non-NaN cells for record count
cell_sum = table.values[~pd.isna(table.values)].sum()
if cell_sum > 0:
total_records = int(cell_sum)
else:
total_records = int(table.shape[0] * table.shape[1])
except (TypeError, ValueError): # pragma: no cover
pass

return variables, total_records

def _extract_regression_info(self, output: list) -> tuple[list[str], int]:
"""Extract variables and total records from regression output.

Parameters
----------
output : list
The output to extract information from.

Returns
-------
tuple[list[str], int]
Variables and total record count.
"""
variables: list[str] = []
total_records: int = 0

for table in output:
if isinstance(table, DataFrame):
for idx in table.index:
idx_str = str(idx)
if "no. observations" in idx_str.lower():
try:
val = table.loc[idx].dropna().iloc[0]
total_records = int(float(val))
except (ValueError, TypeError, IndexError):
pass
break

return variables, total_records

def _mark_diff_risk(self, summary_df: DataFrame) -> DataFrame:
"""Mark outputs with differencing risk.

Parameters
----------
summary_df : DataFrame
The summary DataFrame to update.

Returns
-------
DataFrame
Updated summary DataFrame with diff_risk column.
"""
if summary_df.empty:
summary_df["diff_risk"] = pd.Series(dtype=bool)
else:
summary_df["diff_risk"] = False
table_mask = summary_df["type"] == "table"
table_outputs = summary_df.loc[table_mask]
if not table_outputs.empty:
for _, group in table_outputs.groupby("variables"):
if len(group) > 1:
# Check for different suppression settings
suppressions = group["suppression"].unique()
# Risk if same variables with different suppression settings
if len(suppressions) > 1:
summary_df.loc[group.index, "diff_risk"] = True

return summary_df

def _extract_all_variables(self) -> list[str]:
"""Extract all unique variables across all outputs.

Returns
-------
list[str]
Sorted list of unique variable names.
"""
all_variables: set[str] = set()

for rec in self.results.values():
if rec.output_type == "custom":
continue
variables = self._get_output_variables(rec)
all_variables.update(variables)

return sorted(all_variables)

def _get_output_variables(self, rec: Record) -> list[str]:
"""Extract variables from a single record.

Parameters
----------
rec : Record
The record to extract variables from.

Returns
-------
list[str]
List of variable names.
"""
method = rec.properties.get("method", rec.output_type)
variables: list[str] = []

if rec.output_type == "table":
variables, _ = self._extract_table_info(rec.output, method, rec.properties)
elif rec.output_type == "regression":
variables, _ = self._extract_regression_info(rec.output)
dof = rec.properties.get("dof")
if dof is not None:
variables.append(f"dof={dof}")

return variables

def _build_variable_matrix(self, summary_df: DataFrame) -> DataFrame:
"""Build a variable-output matrix showing variable usage.

Parameters
----------
summary_df : DataFrame
The base summary DataFrame.

Returns
-------
DataFrame
Summary with binary variable columns added.
"""
all_variables = self._extract_all_variables()

if not all_variables:
return summary_df

# Create binary columns for each variable
for var in all_variables:
summary_df[var] = summary_df["variables"].apply(
lambda vars_str, v=var: 1 if v in vars_str.split("; ") else 0
)

return summary_df

def generate_variable_matrix_table(self) -> DataFrame:
    """Generate a clean variable-output matrix table.

    The table has one row per non-custom output. Each row holds the output
    id, the output type, and one binary column per known variable that is
    1 when the output uses the variable and 0 otherwise.

    Returns
    -------
    DataFrame
        Variable matrix table with columns: output_id, output_type,
        var1, var2, ...
    """
    column_names = self._extract_all_variables()

    matrix_rows = [
        {
            "output_id": uid,
            "output_type": rec.output_type,
            **{name: int(name in used) for name in column_names},
        }
        for uid, rec, used in (
            (uid, rec, self._get_output_variables(rec))
            for uid, rec in self.results.items()
            if rec.output_type != "custom"  # custom outputs have no variables
        )
    ]

    return DataFrame(matrix_rows)

def generate_summary(self) -> DataFrame:
    """Generate a summary DataFrame of all outputs in the session.

    Provides output checkers with a high-level overview of all outputs,
    including what method was used, what variables are involved, the
    total record count, and whether there is a differencing risk.
    Outputs of type "custom" are left out.

    Returns
    -------
    DataFrame
        Summary of all outputs with columns: id, method, status, type,
        command, summary, variables, total_records, suppression,
        timestamp, diff_risk, followed by whatever columns
        ``_build_variable_matrix`` adds.
    """
    rows = []
    for uid, rec in self.results.items():
        if rec.output_type == "custom":
            continue
        method = rec.properties.get("method", rec.output_type)
        variables: list[str] = []
        total_records: int = 0

        if rec.output_type == "table":
            # Extract once; the helper is comparatively expensive per table.
            variables, total_records = self._extract_table_info(
                rec.output, method, rec.properties
            )
        elif rec.output_type == "regression":
            variables, total_records = self._extract_regression_info(rec.output)
            dof = rec.properties.get("dof")
            if dof is not None:
                variables.append(f"dof={dof}")

        # An output counts as suppressed only if its SDC summary says so.
        suppression: bool = False
        if isinstance(rec.sdc, dict) and "summary" in rec.sdc:
            suppression = bool(rec.sdc["summary"].get("suppressed", False))

        rows.append(
            {
                "id": uid,
                "method": method,
                "status": rec.status,
                "type": rec.output_type,
                "command": rec.command,
                "summary": rec.summary,
                # Joining an empty list already yields "".
                "variables": "; ".join(variables),
                "total_records": total_records,
                "suppression": suppression,
                "timestamp": rec.timestamp,
            }
        )

    summary_df = DataFrame(rows)
    summary_df = self._mark_diff_risk(summary_df)
    summary_df = self._build_variable_matrix(summary_df)

    return summary_df

def add_summary_to_results(self) -> None:
    """Write the session summary to disk and register it as a custom output.

    The summary produced by ``generate_summary`` is saved as a CSV file in
    the ``acro_artifacts`` directory and added to the results through
    ``add_custom``. Both the file name and the attached comment warn that
    the file must not be released. Nothing is written or registered when
    the summary is empty.
    """
    session_summary = self.generate_summary()
    if not session_summary.empty:
        os.makedirs("acro_artifacts", exist_ok=True)
        # The file name itself signals that this artefact is not for release.
        csv_path = os.path.normpath(
            "acro_artifacts/DO_NOT_RELEASE_session_summary.csv"
        )
        session_summary.to_csv(csv_path, index=False)

        warning = (
            "WARNING: DO NOT RELEASE - Session summary for output checker use only"
        )
        self.add_custom(csv_path, warning)

def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
"""Create a results file for checking.

Expand All @@ -445,6 +737,7 @@ def finalise(self, path: str, ext: str, interactive: bool = False) -> None:
logger.debug("finalise()")
if interactive:
self.validate_outputs()
self.add_summary_to_results()
if ext == "json":
self.finalise_json(path)
elif ext == "xlsx":
Expand Down Expand Up @@ -484,7 +777,19 @@ def finalise_json(self, path: str) -> None:
for file in files:
outputs[key]["files"].append({"name": file, "sdc": val.sdc})

results: dict[str, str | dict] = {"version": __version__, "results": outputs}
# Generate and include session summary for output checkers
summary_df = self.generate_summary()
session_summary = {
"DO_NOT_RELEASE": True,
"purpose": "Session summary for output checker use only",
"data": json.loads(summary_df.to_json(orient="records")),
}

results: dict = {
"version": __version__,
"results": outputs,
"session_summary": session_summary,
}
filename: str = os.path.normpath(f"{path}/results.json")
try:
with open(filename, "w", newline="", encoding="utf-8") as handle:
Expand Down Expand Up @@ -513,6 +818,13 @@ def finalise_excel(self, path: str) -> None:
with pd.ExcelWriter( # pylint: disable=abstract-class-instantiated
filename, engine="openpyxl"
) as writer:
# summary sheet
summary_df = self.generate_summary()
if not summary_df.empty:
summary_df.to_excel(
writer, sheet_name="summary", index=False, startrow=0
)

# description sheet
sheet: list[str] = []
summary: list[str] = []
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ lint.ignore = [

[tool.ruff.lint.per-file-ignores]
"test/*.py" = ["ANN"]
"acro_stata_parser.py" = ["C901"]

[tool.ruff.lint.pep8-naming]
extend-ignore-names = ["X", "X_train", "X_predict"]
Expand Down
Loading
Loading