PolicyEngine · MaxGhenis · Apr 10, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/changelog.d/711.fixed.md b/changelog.d/711.fixed.md
@@ -0,0 +1 @@
+Split legacy national CTC calibration into separate refundable and nonrefundable IRS SOI amount and recipient-count targets, added DB-backed nonrefundable CTC targets for both national and unified district calibration, and fixed recursive package imports so database creation scripts and the national validation tooling can import cleanly in fresh environments. The national validator now also reports CTC totals and grouped diagnostics by AGI band and filing status, its advertised `--hf-path` mode now completes structural checks against published Hugging Face H5 artifacts, and CPS-derived datasets now emit `has_tin` plus a temporary `has_itin` compatibility alias derived from identification status.
diff --git a/policyengine_us_data/__init__.py b/policyengine_us_data/__init__.py
@@ -1,2 +1,50 @@
-from .datasets import *
+from importlib import import_module
+
 from .geography import ZIP_CODE_DATASET
+
+# Public dataset name -> (module path, attribute name). Entries are imported
+# on first attribute access (PEP 562) instead of at package import time.
+_LAZY_EXPORTS = {
+    "CPS_2024": (
+        "policyengine_us_data.datasets.cps.cps",
+        "CPS_2024",
+    ),
+    "EnhancedCPS_2024": (
+        "policyengine_us_data.datasets.cps.enhanced_cps",
+        "EnhancedCPS_2024",
+    ),
+    "ExtendedCPS_2024": (
+        "policyengine_us_data.datasets.cps.extended_cps",
+        "ExtendedCPS_2024",
+    ),
+    "PUF_2024": (
+        "policyengine_us_data.datasets.puf.puf",
+        "PUF_2024",
+    ),
+}
+
+__all__ = ["ZIP_CODE_DATASET", *_LAZY_EXPORTS]
+
+
+def __getattr__(name: str):
+    """Import and return the lazily exported dataset called ``name``.
+
+    The resolved object is stored in the module globals, so this hook runs
+    at most once per name.
+
+    Raises:
+        AttributeError: If ``name`` is not a key of ``_LAZY_EXPORTS``.
+    """
+    target = _LAZY_EXPORTS.get(name)
+    if target is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    module_name, attribute_name = target
+    resolved = getattr(import_module(module_name), attribute_name)
+    globals()[name] = resolved
+    return resolved
+
+
+def __dir__():
+    """List the module globals together with the lazy export names."""
+    return sorted({*globals(), *_LAZY_EXPORTS})
diff --git a/policyengine_us_data/calibration/check_staging_sums.py b/policyengine_us_data/calibration/check_staging_sums.py
@@ -16,6 +16,7 @@
 from policyengine_us_data.calibration.calibration_utils import (
     STATE_CODES,
 )
+from policyengine_us_data.db.etl_irs_soi import get_national_geography_soi_target
 
 STATE_ABBRS = sorted(STATE_CODES.values())
 
@@ -34,6 +35,7 @@
     "ssi",
     "income_tax_before_credits",
     "eitc",
+    "non_refundable_ctc",
     "refundable_ctc",
     "real_estate_taxes",
     "rent",
@@ -45,6 +47,34 @@
 DEFAULT_HF_PREFIX = "hf://policyengine/policyengine-us-data/staging/states"
 
 
+def get_reference_summary(reference_year: int = 2024) -> str:
+    """Build the benefit and credit lines of the printed reference summary.
+
+    The SNAP, SSI, Social Security and EITC figures are fixed text; the two
+    CTC figures come from ``get_national_geography_soi_target``.
+
+    Args:
+        reference_year: Year passed to the SOI target lookup.
+
+    Returns:
+        Two lines of text joined by a newline, with no trailing newline.
+    """
+    refundable, non_refundable = (
+        get_national_geography_soi_target(variable, reference_year)
+        for variable in ("refundable_ctc", "non_refundable_ctc")
+    )
+    fixed_line = "  SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T"
+    # Amounts are divided by 1e9 so they print in billions.
+    ctc_line = (
+        "  EITC ~$60B, "
+        f"refundable CTC ~${refundable['amount'] / 1e9:.1f}B "
+        f"(IRS SOI {refundable['source_year']}), "
+        f"non-refundable CTC ~${non_refundable['amount'] / 1e9:.1f}B "
+        f"(IRS SOI {non_refundable['source_year']})"
+    )
+    return f"{fixed_line}\n{ctc_line}"
+
+
 def main(argv=None):
     parser = argparse.ArgumentParser(
         description="Sum key variables across staging state H5 files"
@@ -110,8 +140,7 @@ def main(argv=None):
     print("=" * 70)
     print("  US GDP ~$29T, US population ~335M, ~130M households")
     print("  Total AGI ~$15T, Employment income ~$10T")
-    print("  SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T")
-    print("  EITC ~$60B, CTC ~$120B")
+    print(get_reference_summary())
 
     if errors:
         print(f"\n{len(errors)} states failed:")

diff --git a/policyengine_us_data/calibration/ctc_diagnostics.py b/policyengine_us_data/calibration/ctc_diagnostics.py
@@ -0,0 +1,185 @@
+"""Weighted CTC diagnostics grouped by AGI band and filing status."""
+
+import numpy as np
+import pandas as pd
+
+# (inclusive lower bound, exclusive upper bound, label), in ascending order.
+IRS_AGI_BANDS = [
+    (-np.inf, 1.0, "<$1"),
+    (1.0, 10_000.0, "$1-$10k"),
+    (10_000.0, 25_000.0, "$10k-$25k"),
+    (25_000.0, 50_000.0, "$25k-$50k"),
+    (50_000.0, 75_000.0, "$50k-$75k"),
+    (75_000.0, 100_000.0, "$75k-$100k"),
+    (100_000.0, 200_000.0, "$100k-$200k"),
+    (200_000.0, 500_000.0, "$200k-$500k"),
+    (500_000.0, np.inf, "$500k+"),
+]
+
+# Raw filing-status value -> display group; unlisted values become "Other".
+FILING_STATUS_LABELS = {
+    "SINGLE": "Single",
+    "HEAD_OF_HOUSEHOLD": "Head of household",
+    "JOINT": "Joint / surviving spouse",
+    "SURVIVING_SPOUSE": "Joint / surviving spouse",
+    "SEPARATE": "Separate",
+}
+
+# Row order of the filing-status table.
+FILING_STATUS_ORDER = [
+    "Single",
+    "Head of household",
+    "Joint / surviving spouse",
+    "Separate",
+    "Other",
+]
+
+# Weighted columns that are summed within each group.
+CTC_GROUP_COLUMNS = [
+    "tax_unit_count",
+    "ctc_qualifying_children",
+    "ctc_recipient_count",
+    "refundable_ctc_recipient_count",
+    "non_refundable_ctc_recipient_count",
+    "ctc",
+    "refundable_ctc",
+    "non_refundable_ctc",
+]
+
+
+def _assign_agi_bands(adjusted_gross_income: np.ndarray) -> pd.Categorical:
+    """Label each AGI value with its band from ``IRS_AGI_BANDS``.
+
+    Args:
+        adjusted_gross_income: One-dimensional array of AGI values.
+
+    Returns:
+        An ordered categorical whose categories are the band labels in
+        ``IRS_AGI_BANDS`` order. NaN inputs become missing values so they
+        are never reported as belonging to a band.
+    """
+    labels = [label for _, _, label in IRS_AGI_BANDS]
+    agi = np.asarray(adjusted_gross_income, dtype=float)
+    # Start from the open-ended top band: +inf fails the strict ``< upper``
+    # test of every band, including the last one.
+    agi_band = np.full(agi.shape, labels[-1], dtype=object)
+    for lower, upper, label in IRS_AGI_BANDS:
+        agi_band[(agi >= lower) & (agi < upper)] = label
+    # NaN fails every comparison too; mark it missing instead of leaving it
+    # on the top-band default.
+    agi_band[np.isnan(agi)] = np.nan
+    return pd.Categorical(agi_band, categories=labels, ordered=True)
+
+
+def _normalize_filing_status(filing_status: pd.Series) -> pd.Categorical:
+    """Collapse raw filing-status values into the display groups.
+
+    Values are matched by their string form against
+    ``FILING_STATUS_LABELS``; anything unmatched becomes ``"Other"``.
+    """
+    labels = [
+        FILING_STATUS_LABELS.get(value, "Other")
+        for value in filing_status.astype(str)
+    ]
+    return pd.Categorical(labels, categories=FILING_STATUS_ORDER, ordered=True)
+
+
+def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]:
+    """Aggregate weighted CTC diagnostics by AGI band and filing status.
+
+    Args:
+        frame: Table with the columns ``adjusted_gross_income``,
+            ``filing_status``, ``tax_unit_weight``,
+            ``ctc_qualifying_children``, ``ctc``, ``refundable_ctc`` and
+            ``non_refundable_ctc`` (presumably one row per tax unit). The
+            input frame is copied, not modified.
+
+    Returns:
+        A dict with the keys ``"by_agi_band"`` and ``"by_filing_status"``.
+        Each table has a ``group`` column plus the weighted sums named in
+        ``CTC_GROUP_COLUMNS``, one row per category (empty categories
+        included). Rows with NaN AGI are left out of the AGI-band table.
+    """
+    work = frame.copy()
+    weights = work["tax_unit_weight"].astype(float).to_numpy()
+
+    work["agi_band"] = _assign_agi_bands(
+        work["adjusted_gross_income"].astype(float).to_numpy()
+    )
+    work["filing_status_group"] = _normalize_filing_status(work["filing_status"])
+
+    work["tax_unit_count"] = weights
+    work["ctc_qualifying_children"] = (
+        work["ctc_qualifying_children"].astype(float).to_numpy() * weights
+    )
+    for credit in ("ctc", "refundable_ctc", "non_refundable_ctc"):
+        amounts = work[credit].astype(float).to_numpy()
+        # Recipients are units with a positive unweighted amount, so count
+        # them before the column is replaced by its weighted value.
+        work[f"{credit}_recipient_count"] = (amounts > 0) * weights
+        work[credit] = amounts * weights
+
+    def summarize(group_column: str) -> pd.DataFrame:
+        return (
+            work.groupby(group_column, observed=False)[CTC_GROUP_COLUMNS]
+            .sum()
+            .reset_index()
+            .rename(columns={group_column: "group"})
+        )
+
+    return {
+        "by_agi_band": summarize("agi_band"),
+        "by_filing_status": summarize("filing_status_group"),
+    }
+
+
+def create_ctc_diagnostic_tables(sim) -> dict[str, pd.DataFrame]:
+    """Calculate weighted CTC diagnostic tables from a microsimulation.
+
+    Args:
+        sim: Object whose ``calculate(name)`` result exposes ``.values``.
+
+    Returns:
+        The tables produced by ``build_ctc_diagnostic_tables``.
+    """
+    variables = [
+        "adjusted_gross_income",
+        "filing_status",
+        "tax_unit_weight",
+        "ctc_qualifying_children",
+        "ctc",
+        "refundable_ctc",
+        "non_refundable_ctc",
+    ]
+    frame = pd.DataFrame({name: sim.calculate(name).values for name in variables})
+    return build_ctc_diagnostic_tables(frame)
+
+
+def _format_count(value: float) -> str:
+    """Format a count in millions with two decimals, e.g. ``1.50M``."""
+    return f"{value / 1e6:,.2f}M"
+
+
+def _format_amount(value: float) -> str:
+    """Format an amount in billions with one decimal, e.g. ``$1.5B``."""
+    return f"${value / 1e9:,.1f}B"
+
+
+def format_ctc_diagnostic_table(table: pd.DataFrame) -> str:
+    """Render a diagnostic table as fixed-width text.
+
+    Count columns are shown in millions and amount columns in billions;
+    the input table is not modified and the index is omitted.
+    """
+    display = table.copy()
+    for column in [
+        "tax_unit_count",
+        "ctc_qualifying_children",
+        "ctc_recipient_count",
+        "refundable_ctc_recipient_count",
+        "non_refundable_ctc_recipient_count",
+    ]:
+        display[column] = display[column].map(_format_count)
+    for column in ["ctc", "refundable_ctc", "non_refundable_ctc"]:
+        display[column] = display[column].map(_format_amount)
+    return display.to_string(index=False)
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
@@ -28,6 +28,8 @@ include:
     geo_level: district
   - variable: refundable_ctc
     geo_level: district
+  - variable: non_refundable_ctc
+    geo_level: district
   - variable: unemployment_compensation
     geo_level: district
 
@@ -148,6 +150,9 @@ include:
   - variable: refundable_ctc
     geo_level: national
     domain_variable: refundable_ctc
+  - variable: non_refundable_ctc
+    geo_level: national
+    domain_variable: non_refundable_ctc
   - variable: self_employment_income
     geo_level: national
     domain_variable: self_employment_income
@@ -168,6 +173,9 @@ include:
   - variable: tax_unit_count
     geo_level: national
     domain_variable: refundable_ctc
+  - variable: tax_unit_count
+    geo_level: national
+    domain_variable: non_refundable_ctc
 
   # === NATIONAL — SOI deduction totals (non-reform) ===
   - variable: medical_expense_deduction

diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py
@@ -34,6 +34,7 @@
 from typing import Optional
 
 import numpy as np
+import pandas as pd
 
 logging.basicConfig(
     level=logging.INFO,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Split legacy national CTC calibration into separate refundable and nonrefundable IRS SOI amount and recipient-count targets, added DB-backed nonrefundable CTC targets for both national and unified district calibration, and fixed recursive package imports so database creation scripts and the national validation tooling can import cleanly in fresh environments. The national validator now also reports CTC totals and grouped diagnostics by AGI band and filing status, its advertised `--hf-path` mode now completes structural checks against published Hugging Face H5 artifacts, and CPS-derived datasets now emit `has_tin` plus a temporary `has_itin` compatibility alias derived from identification status.