Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ include:
- variable: household_count
geo_level: district

# === DISTRICT — SNAP household counts (ACS S2201) ===
- variable: household_count
geo_level: district
domain_variable: snap

# === DISTRICT — dollar targets ===
- variable: adjusted_gross_income
geo_level: district
Expand Down Expand Up @@ -42,13 +47,33 @@ include:
geo_level: state
- variable: adjusted_gross_income
geo_level: state

# === STATE — fine AGI bracket targets (stubs 9/10 from in55cmcsv) ===
- variable: person_count
geo_level: state
domain_variable: adjusted_gross_income
- variable: adjusted_gross_income
geo_level: state
domain_variable: adjusted_gross_income
# REMOVED: state_income_tax — ETL hardcodes $0 for WA and NH, but
# PolicyEngine correctly computes non-zero tax (WA capital gains tax,
# NH interest/dividends tax). The $0 targets produce catastrophic loss
# that crushes WA/NH weights to zero. Fix the ETL before re-enabling.
# - variable: state_income_tax
# geo_level: state

# === NATIONAL — fine AGI bracket targets (Table 1.4) ===
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income

# === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
- variable: net_worth
geo_level: national

# === NATIONAL — aggregate dollar targets ===
- variable: adjusted_gross_income
geo_level: national
Expand Down Expand Up @@ -164,11 +189,15 @@ include:
- variable: qualified_business_income_deduction
geo_level: national

# === NATIONAL — CBO income tax target (re-enabled: 22% error < 54% unconstrained) ===
- variable: income_tax_positive
geo_level: national

# NOT INCLUDED — high error or tension (from prior validation)
# =====================================================================
# dividend_income (26%, tension), qualified_dividend_income (29%, tension),
# eitc by child_count (14-77%, tension), rental_income (20%),
# income_tax_before_credits (21%), income_tax_positive (22%),
# income_tax_before_credits (21%),
# salt SOI (102%), taxable_interest_income (61%),
# tax_exempt_interest_income (61%), taxable_ira_distributions (68%),
# taxable_social_security (55%), person_count by AGI bins (100%)
212 changes: 212 additions & 0 deletions policyengine_us_data/db/etl_irs_soi.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@
save_bytes,
)
from policyengine_us_data.utils.soi import get_tracked_soi_row
from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
STATE_ABBR_TO_FIPS,
)
from policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets import (
_load_workbook,
_scaled_cell,
)

logger = logging.getLogger(__name__)

Expand All @@ -57,6 +64,33 @@
9: (500_000, np.inf), # $500,000 or more
}

# State-level fine AGI stubs from the `in55cmcsv` SOI file, which splits the
# geography-file top-coded stub 9 (500k+) into two finer bins.
# Values are (lower, upper) AGI bounds in dollars; upper bound is exclusive.
STATE_FINE_AGI_STUBS = {
    9: (500_000, 1_000_000),  # $500,000 under $1,000,000
    10: (1_000_000, np.inf),  # $1,000,000 or more
}

# National fine AGI brackets keyed by their Excel row number in Publication
# 1304 Table 1.4; values are (lower, upper) AGI bounds in dollars,
# upper-exclusive, extending the top tail out to $10m+.
NATIONAL_FINE_AGI_BRACKETS = {
    23: (500_000, 1_000_000),  # Table 1.4 row 23
    24: (1_000_000, 1_500_000),  # row 24
    25: (1_500_000, 2_000_000),  # row 25
    26: (2_000_000, 5_000_000),  # row 26
    27: (5_000_000, 10_000_000),  # row 27
    28: (10_000_000, np.inf),  # row 28
}


def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) -> bool:
"""Skip the coarse state 500k+ count target when fine state bins are loaded.

The standard geography-file SOI feed only has a top-coded state AGI stub 9
(500k+). We separately load `in55cmcsv`, which splits that state tail into
500k-1m and 1m+. Keeping the coarse state count target alongside the fine
rows would double-constrain the same top-tail population in calibration.
"""

return geo_type == "state" and agi_stub == 9


# These variables map cleanly from Publication 1304 aggregate tables to the
# existing national IRS-SOI domain strata. We intentionally leave `aca_ptc`
# and `refundable_ctc` on the geography-file path for now because the
Expand Down Expand Up @@ -396,6 +430,179 @@ def load_national_workbook_soi_targets(
)


def extract_state_fine_agi_data(year: int) -> pd.DataFrame:
    """Download the state-level SOI file (in55cmcsv) with stubs 9 and 10.

    Downloads (or reads from the local cache) the IRS `in55cmcsv` CSV for
    *year*, then keeps only the fine top-tail AGI stubs defined in
    ``STATE_FINE_AGI_STUBS`` and rows for tracked states.

    Args:
        year: Tax year used to build the IRS file-name prefix.

    Returns:
        Filtered DataFrame with only stub-9/10 rows for known states.

    Raises:
        requests.HTTPError: If the IRS download returns an error status.
    """
    year_prefix = _year_prefix(year)
    cache_file = f"irs_soi_{year_prefix}in55cmcsv.csv"
    if is_cached(cache_file):
        logger.info(f"Using cached {cache_file}")
    else:
        import requests

        url = f"https://www.irs.gov/pub/irs-soi/{year_prefix}in55cmcsv.csv"
        # requests has no default timeout; without one a stalled connection
        # would hang the ETL forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        save_bytes(cache_file, response.content)
    # `thousands=","` because IRS CSVs format large counts with separators.
    df = pd.read_csv(cache_path(cache_file), thousands=",")

    # Keep only the fine top-tail stubs (9: 500k-1m, 10: 1m+) and rows whose
    # state abbreviation we can map to a FIPS code (drops US totals/territories).
    df = df[df["AGI_STUB"].isin(STATE_FINE_AGI_STUBS.keys())]
    df = df[df["STATE"].isin(STATE_ABBR_TO_FIPS.keys())]
    return df


def load_state_fine_agi_targets(
    session: Session, filer_strata: dict, year: int
) -> None:
    """Create strata and targets for state-level fine AGI brackets (stubs 9/10).

    For each state row in the `in55cmcsv` file, finds or creates a child
    stratum under that state's filer stratum and upserts the person-count
    and AGI-amount targets for the fine bracket.
    """
    fine_rows = extract_state_fine_agi_data(year)

    for _, record in fine_rows.iterrows():
        stub = int(record["AGI_STUB"])
        lower, upper = STATE_FINE_AGI_STUBS[stub]
        fips_int = int(STATE_ABBR_TO_FIPS[record["STATE"]])

        parent_stratum_id = filer_strata["state"][fips_int]
        # The notes string doubles as the lookup key for idempotent reruns.
        note = f"State FIPS {fips_int} filers, AGI >= {lower}, AGI < {upper}"

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == parent_stratum_id,
                Stratum.notes == note,
            )
            .first()
        )

        if stratum is None:
            stratum = Stratum(
                parent_stratum_id=parent_stratum_id,
                notes=note,
            )
            constraint_specs = [
                ("tax_unit_is_filer", "==", "1"),
                ("state_fips", "==", str(fips_int)),
                ("adjusted_gross_income", ">=", str(lower)),
                ("adjusted_gross_income", "<", str(upper)),
            ]
            stratum.constraints_rel.extend(
                [
                    StratumConstraint(
                        constraint_variable=variable,
                        operation=operation,
                        value=value,
                    )
                    for variable, operation, value in constraint_specs
                ]
            )
            session.add(stratum)
            # Flush so stratum_id is assigned before targets reference it.
            session.flush()

        target_note = f"State fine AGI stub {stub} from in55cmcsv"
        # N2 is a raw person count; A00100 (AGI) is reported in $thousands.
        for target_variable, target_value in (
            ("person_count", float(record["N2"])),
            ("adjusted_gross_income", float(record["A00100"]) * 1000),
        ):
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=target_variable,
                period=year,
                value=target_value,
                source="IRS SOI",
                notes=target_note,
            )


def load_national_fine_agi_targets(
    session: Session, national_filer_stratum_id: int, target_year: int
) -> None:
    """Create strata and targets for national fine AGI brackets from Table 1.4.

    For each bracket in ``NATIONAL_FINE_AGI_BRACKETS``, finds or creates a
    child stratum under the national filer stratum and upserts the tax-unit
    count and AGI-amount targets read from the Publication 1304 workbook.
    """
    workbook = _load_workbook("Table 1.4", target_year)

    for excel_row, (lower, upper) in NATIONAL_FINE_AGI_BRACKETS.items():
        # The notes string doubles as the lookup key for idempotent reruns.
        note = f"National filers, AGI >= {lower}, AGI < {upper}"

        stratum = (
            session.query(Stratum)
            .filter(
                Stratum.parent_stratum_id == national_filer_stratum_id,
                Stratum.notes == note,
            )
            .first()
        )

        if stratum is None:
            stratum = Stratum(
                parent_stratum_id=national_filer_stratum_id,
                notes=note,
            )
            for variable, operation, value in (
                ("tax_unit_is_filer", "==", "1"),
                ("adjusted_gross_income", ">=", str(lower)),
                ("adjusted_gross_income", "<", str(upper)),
            ):
                stratum.constraints_rel.append(
                    StratumConstraint(
                        constraint_variable=variable,
                        operation=operation,
                        value=value,
                    )
                )
            session.add(stratum)
            # Flush so stratum_id is assigned before targets reference it.
            session.flush()

        target_note = f"Table 1.4 row {excel_row} fine AGI bracket"
        # Column B holds returns (counts); column C holds AGI amounts.
        for target_variable, target_value in (
            ("tax_unit_count", _scaled_cell(workbook, excel_row, "B", is_count=True)),
            (
                "adjusted_gross_income",
                _scaled_cell(workbook, excel_row, "C", is_count=False),
            ),
        ):
            _upsert_target(
                session,
                stratum_id=stratum.stratum_id,
                variable=target_variable,
                period=target_year,
                value=target_value,
                source="IRS SOI",
                notes=target_note,
            )


def transform_soi_data(raw_df):

TARGETS = [
Expand Down Expand Up @@ -645,7 +852,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
filer_strata["national"],
national_year,
)
load_national_fine_agi_targets(session, filer_strata["national"], national_year)

load_state_fine_agi_targets(session, filer_strata, year)
session.commit()

# Load EITC data --------------------------------------------------------
Expand Down Expand Up @@ -1048,6 +1257,9 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None):
geo_info = parse_ucgid(ucgid_i)
person_count = agi_df.iloc[i][["target_value"]].values[0]

if _skip_coarse_state_agi_person_count_target(geo_info["type"], agi_stub):
continue

if geo_info["type"] == "state":
parent_stratum_id = filer_strata["state"][geo_info["state_fips"]]
note = f"State FIPS {geo_info['state_fips']} filers, AGI >= {agi_income_lower}, AGI < {agi_income_upper}"
Expand Down
22 changes: 8 additions & 14 deletions tests/integration/test_enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,27 +283,21 @@ def test_immigration_status_diversity():
"""Test that immigration statuses show appropriate diversity (not all citizens)."""
from policyengine_us_data.datasets.cps import EnhancedCPS_2024
from policyengine_us import Microsimulation
import numpy as np

sim = Microsimulation(dataset=EnhancedCPS_2024)

# Get immigration status for all persons (already weighted MicroSeries)
# Get immigration status for all persons (weighted MicroSeries)
immigration_status = sim.calculate("immigration_status", 2024)

# Count different statuses
unique_statuses, counts = np.unique(immigration_status, return_counts=True)

# Calculate percentages using the weights directly
total_population = len(immigration_status)
status_percentages = {}
# Weighted counts by status
weighted_counts = immigration_status.weights.groupby(immigration_status).sum()
total_weighted = weighted_counts.sum()

for status, count in zip(unique_statuses, counts):
pct = 100 * count / total_population
status_percentages[status] = pct
print(f" {status}: {count:,} ({pct:.1f}%)")
for status, wt in weighted_counts.items():
pct = 100 * wt / total_weighted
print(f" {status}: {wt:,.0f} ({pct:.1f}%)")

# Test that not everyone is a citizen (would indicate default value being used)
citizen_pct = status_percentages.get("CITIZEN", 0)
citizen_pct = 100 * weighted_counts.get("CITIZEN", 0) / total_weighted

# Fail if more than 99% are citizens (indicating the default is being used)
assert citizen_pct < 99, (
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/test_etl_irs_soi_overlay.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
create_database,
)
from policyengine_us_data.db.etl_irs_soi import (
_skip_coarse_state_agi_person_count_target,
_get_or_create_national_domain_stratum,
_upsert_target,
load_national_workbook_soi_targets,
Expand Down Expand Up @@ -180,3 +181,10 @@ def fake_get_tracked_soi_row(variable, requested_year, **kwargs):
assert len(count_rows) == 1
assert int(count_rows.iloc[0]["period"]) == 2023
assert float(count_rows.iloc[0]["value"]) == 50.0


def test_skip_coarse_state_agi_person_count_target_only_for_state_stub_9():
    # Only the (state, stub 9) combination is replaced by fine bins.
    cases = [
        (("state", 9), True),
        (("state", 8), False),
        (("district", 9), False),
        (("national", 9), False),
    ]
    for args, expected in cases:
        assert _skip_coarse_state_agi_person_count_target(*args) is expected
Loading