Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/711.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Split legacy national CTC calibration into separate refundable and nonrefundable IRS SOI amount and recipient-count targets, added DB-backed nonrefundable CTC targets for both national and unified district calibration, and fixed recursive package imports so database creation scripts and the national validation tooling can import cleanly in fresh environments. The national validator now also reports CTC totals and grouped diagnostics by AGI band and filing status, its advertised `--hf-path` mode now completes structural checks against published Hugging Face H5 artifacts, and CPS-derived datasets now emit `has_tin` plus a temporary `has_itin` compatibility alias derived from identification status.
38 changes: 37 additions & 1 deletion policyengine_us_data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,38 @@
from .datasets import *
from importlib import import_module

from .geography import ZIP_CODE_DATASET

# Public dataset names that are resolved on demand. Each entry maps the
# exported name to a (module path, attribute name) pair; the module-level
# __getattr__ imports the listed module only when the name is first requested.
_LAZY_EXPORTS = {
    "CPS_2024": (
        "policyengine_us_data.datasets.cps.cps",
        "CPS_2024",
    ),
    "EnhancedCPS_2024": (
        "policyengine_us_data.datasets.cps.enhanced_cps",
        "EnhancedCPS_2024",
    ),
    "ExtendedCPS_2024": (
        "policyengine_us_data.datasets.cps.extended_cps",
        "ExtendedCPS_2024",
    ),
    "PUF_2024": (
        "policyengine_us_data.datasets.puf.puf",
        "PUF_2024",
    ),
}

# The lazy names are advertised alongside the eagerly imported
# ZIP_CODE_DATASET (iterating the dict unpacks its keys).
__all__ = ["ZIP_CODE_DATASET", *_LAZY_EXPORTS]


def __getattr__(name: str):
    """Resolve a lazily exported name the first time it is requested.

    Python calls a module-level ``__getattr__`` only when normal attribute
    lookup on the module fails (PEP 562). The resolved object is stored in the
    module globals, so later lookups of the same name find it directly.

    Args:
        name: Attribute requested on this package.

    Returns:
        The object registered under ``name`` in ``_LAZY_EXPORTS``.

    Raises:
        AttributeError: If ``name`` is not one of the lazy exports.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, attribute = target
    resolved = getattr(import_module(module_path), attribute)
    # Cache on the module so the import machinery is not consulted again.
    globals()[name] = resolved
    return resolved


def __dir__():
    """Return the sorted module attribute names, including lazy exports.

    Lazy exports are listed even before they have been imported, so that
    ``dir()`` and tab completion show the full public surface.
    """
    names = set(globals())
    names.update(_LAZY_EXPORTS)
    return sorted(names)
23 changes: 21 additions & 2 deletions policyengine_us_data/calibration/check_staging_sums.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from policyengine_us_data.calibration.calibration_utils import (
STATE_CODES,
)
from policyengine_us_data.db.etl_irs_soi import get_national_geography_soi_target

STATE_ABBRS = sorted(STATE_CODES.values())

Expand All @@ -34,6 +35,7 @@
"ssi",
"income_tax_before_credits",
"eitc",
"non_refundable_ctc",
"refundable_ctc",
"real_estate_taxes",
"rent",
Expand All @@ -45,6 +47,24 @@
DEFAULT_HF_PREFIX = "hf://policyengine/policyengine-us-data/staging/states"


def get_reference_summary(reference_year: int = 2024) -> str:
    """Build the benefit and tax-credit lines of the printed reference summary.

    The two CTC figures are looked up through
    ``get_national_geography_soi_target`` instead of being hard-coded. Each is
    shown in billions of dollars next to the ``source_year`` returned by that
    lookup.

    Args:
        reference_year: Year passed to the SOI target lookup.

    Returns:
        A two-line string: fixed SNAP / SSI / Social Security figures, then
        the EITC figure followed by the refundable and non-refundable CTC
        targets.
    """
    ctc_targets = {
        variable: get_national_geography_soi_target(variable, reference_year)
        for variable in ("refundable_ctc", "non_refundable_ctc")
    }
    refundable = ctc_targets["refundable_ctc"]
    non_refundable = ctc_targets["non_refundable_ctc"]

    summary_lines = [
        " SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T",
        (
            f" EITC ~$60B, refundable CTC ~${refundable['amount'] / 1e9:.1f}B "
            f"(IRS SOI {refundable['source_year']}), "
            f"non-refundable CTC ~${non_refundable['amount'] / 1e9:.1f}B "
            f"(IRS SOI {non_refundable['source_year']})"
        ),
    ]
    return "\n".join(summary_lines)


def main(argv=None):
parser = argparse.ArgumentParser(
description="Sum key variables across staging state H5 files"
Expand Down Expand Up @@ -110,8 +130,7 @@ def main(argv=None):
print("=" * 70)
print(" US GDP ~$29T, US population ~335M, ~130M households")
print(" Total AGI ~$15T, Employment income ~$10T")
print(" SNAP ~$110B, SSI ~$60B, Social Security ~$1.2T")
print(" EITC ~$60B, CTC ~$120B")
print(get_reference_summary())

if errors:
print(f"\n{len(errors)} states failed:")
Expand Down
145 changes: 145 additions & 0 deletions policyengine_us_data/calibration/ctc_diagnostics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import numpy as np
import pandas as pd

# AGI bands used to group tax units, as (lower bound, upper bound, label).
# _assign_agi_bands treats each band as half-open: lower <= AGI < upper.
IRS_AGI_BANDS = [
    (-np.inf, 1.0, "<$1"),
    (1.0, 10_000.0, "$1-$10k"),
    (10_000.0, 25_000.0, "$10k-$25k"),
    (25_000.0, 50_000.0, "$25k-$50k"),
    (50_000.0, 75_000.0, "$50k-$75k"),
    (75_000.0, 100_000.0, "$75k-$100k"),
    (100_000.0, 200_000.0, "$100k-$200k"),
    (200_000.0, 500_000.0, "$200k-$500k"),
    (500_000.0, np.inf, "$500k+"),
]

# Maps raw filing-status codes to display labels. JOINT and SURVIVING_SPOUSE
# share one label, so they are reported as a single group.
FILING_STATUS_LABELS = {
    "SINGLE": "Single",
    "HEAD_OF_HOUSEHOLD": "Head of household",
    "JOINT": "Joint / surviving spouse",
    "SURVIVING_SPOUSE": "Joint / surviving spouse",
    "SEPARATE": "Separate",
}

# Category order for the filing-status groups. "Other" is the fallback label
# _normalize_filing_status assigns to codes missing from FILING_STATUS_LABELS.
FILING_STATUS_ORDER = [
    "Single",
    "Head of household",
    "Joint / surviving spouse",
    "Separate",
    "Other",
]

# Columns that build_ctc_diagnostic_tables sums within each group.
CTC_GROUP_COLUMNS = [
    "tax_unit_count",
    "ctc_qualifying_children",
    "ctc_recipient_count",
    "refundable_ctc_recipient_count",
    "non_refundable_ctc_recipient_count",
    "ctc",
    "refundable_ctc",
    "non_refundable_ctc",
]


def _assign_agi_bands(adjusted_gross_income: np.ndarray) -> pd.Categorical:
    """Label each AGI value with the band from ``IRS_AGI_BANDS`` that holds it.

    A value belongs to a band when ``lower <= value < upper``. Values that
    match no band (for example NaN or ``+inf``) keep the default, which is the
    last band.

    Args:
        adjusted_gross_income: Array of AGI values, one per tax unit.

    Returns:
        An ordered categorical whose categories are the band labels in
        ``IRS_AGI_BANDS`` order.
    """
    band_labels = [label for *_, label in IRS_AGI_BANDS]
    # Work in integer category codes; start everything in the last band.
    band_codes = np.full(len(adjusted_gross_income), len(band_labels) - 1)
    for code, (lower, upper, _) in enumerate(IRS_AGI_BANDS):
        in_band = (adjusted_gross_income >= lower) & (adjusted_gross_income < upper)
        band_codes[in_band] = code
    return pd.Categorical.from_codes(
        band_codes,
        categories=band_labels,
        ordered=True,
    )


def _normalize_filing_status(filing_status: pd.Series) -> pd.Categorical:
    """Translate raw filing-status codes into ordered display groups.

    Each value is converted to ``str`` and looked up in
    ``FILING_STATUS_LABELS``; codes that are not listed there become
    ``"Other"``.

    Args:
        filing_status: Filing-status code for each tax unit.

    Returns:
        An ordered categorical using ``FILING_STATUS_ORDER`` as its categories.
    """
    status_codes = filing_status.astype(str)
    group_names = []
    for status_code in status_codes:
        # str() is kept so the lookup key is always a plain Python string.
        group_names.append(FILING_STATUS_LABELS.get(str(status_code), "Other"))
    return pd.Categorical(
        group_names,
        categories=FILING_STATUS_ORDER,
        ordered=True,
    )


def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]:
    """Aggregate weighted CTC diagnostics by AGI band and filing status.

    Every per-tax-unit value is multiplied by ``tax_unit_weight`` before
    summing. A recipient count is the weighted number of tax units whose
    credit amount is strictly positive.

    Args:
        frame: One row per tax unit with the columns ``tax_unit_weight``,
            ``adjusted_gross_income``, ``filing_status``,
            ``ctc_qualifying_children``, ``ctc``, ``refundable_ctc`` and
            ``non_refundable_ctc``. The input frame is not modified.

    Returns:
        A dict with the keys ``"by_agi_band"`` and ``"by_filing_status"``.
        Each table has a ``group`` column followed by the
        ``CTC_GROUP_COLUMNS`` totals. Groups with no rows are kept because
        the grouping uses ``observed=False``.
    """
    table = frame.copy()
    unit_weights = table["tax_unit_weight"].astype(float).to_numpy()

    def as_floats(column: str) -> np.ndarray:
        # Read the current (still unweighted) values of a column.
        return table[column].astype(float).to_numpy()

    table["agi_band"] = _assign_agi_bands(as_floats("adjusted_gross_income"))
    table["filing_status_group"] = _normalize_filing_status(table["filing_status"])

    table["tax_unit_count"] = unit_weights
    table["ctc_qualifying_children"] = (
        as_floats("ctc_qualifying_children") * unit_weights
    )
    for credit in ("ctc", "refundable_ctc", "non_refundable_ctc"):
        raw_amounts = as_floats(credit)
        # Count recipients from the raw amounts before the column is
        # replaced by its weighted total.
        table[f"{credit}_recipient_count"] = (raw_amounts > 0).astype(
            float
        ) * unit_weights
        table[credit] = raw_amounts * unit_weights

    def totals_by(group_column: str) -> pd.DataFrame:
        # Sum the diagnostic columns within each category of group_column.
        grouped = table.groupby(group_column, observed=False)[CTC_GROUP_COLUMNS]
        return grouped.sum().reset_index().rename(columns={group_column: "group"})

    return {
        "by_agi_band": totals_by("agi_band"),
        "by_filing_status": totals_by("filing_status_group"),
    }


def create_ctc_diagnostic_tables(sim) -> dict[str, pd.DataFrame]:
    """Calculate weighted CTC diagnostic tables from a microsimulation.

    Args:
        sim: Object exposing ``calculate(variable)`` whose result has a
            ``.values`` attribute. It is called once per variable listed
            below.

    Returns:
        The tables produced by ``build_ctc_diagnostic_tables``.
    """
    variables = (
        "adjusted_gross_income",
        "filing_status",
        "tax_unit_weight",
        "ctc_qualifying_children",
        "ctc",
        "refundable_ctc",
        "non_refundable_ctc",
    )
    columns = {variable: sim.calculate(variable).values for variable in variables}
    return build_ctc_diagnostic_tables(pd.DataFrame(columns))


def _format_count(value: float) -> str:
    """Render a count in millions with two decimals, e.g. ``1.23M``."""
    millions = value / 1e6
    return "{:,.2f}M".format(millions)


def _format_amount(value: float) -> str:
    """Render a dollar amount in billions with one decimal, e.g. ``$2.5B``."""
    billions = value / 1e9
    return "${:,.1f}B".format(billions)


def format_ctc_diagnostic_table(table: pd.DataFrame) -> str:
    """Render one diagnostic table as aligned, human-readable text.

    Count columns are shown in millions and amount columns in billions of
    dollars. The input table is not modified.

    Args:
        table: A table holding the count and amount columns listed below,
            such as one returned by ``build_ctc_diagnostic_tables``.

    Returns:
        The formatted table as a string without the index column.
    """
    count_columns = (
        "tax_unit_count",
        "ctc_qualifying_children",
        "ctc_recipient_count",
        "refundable_ctc_recipient_count",
        "non_refundable_ctc_recipient_count",
    )
    amount_columns = ("ctc", "refundable_ctc", "non_refundable_ctc")

    rendered = table.copy()
    for formatter, columns in (
        (_format_count, count_columns),
        (_format_amount, amount_columns),
    ):
        for column in columns:
            rendered[column] = rendered[column].map(formatter)
    return rendered.to_string(index=False)
8 changes: 8 additions & 0 deletions policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ include:
geo_level: district
- variable: refundable_ctc
geo_level: district
- variable: non_refundable_ctc
geo_level: district
- variable: unemployment_compensation
geo_level: district

Expand Down Expand Up @@ -148,6 +150,9 @@ include:
- variable: refundable_ctc
geo_level: national
domain_variable: refundable_ctc
- variable: non_refundable_ctc
geo_level: national
domain_variable: non_refundable_ctc
- variable: self_employment_income
geo_level: national
domain_variable: self_employment_income
Expand All @@ -168,6 +173,9 @@ include:
- variable: tax_unit_count
geo_level: national
domain_variable: refundable_ctc
- variable: tax_unit_count
geo_level: national
domain_variable: non_refundable_ctc

# === NATIONAL — SOI deduction totals (non-reform) ===
- variable: medical_expense_deduction
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/calibration/unified_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from typing import Optional

import numpy as np
import pandas as pd

logging.basicConfig(
level=logging.INFO,
Expand Down
Loading
Loading