Skip to content

Commit 8d1f57c

Browse files
committed
Refresh local SOI state targets for local calibration
1 parent a84b882 commit 8d1f57c

File tree

5 files changed

+347
-18
lines changed

5 files changed

+347
-18
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added an explicit refresh path and regression coverage for the legacy `agi_state.csv` SOI targets used by local calibration.

policyengine_us_data/storage/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
• Location: https://www.cms.gov/files/document/effectuated-enrollment-early-snapshot-2025-and-full-year-2024-average.pdf
1212
• Notes: `enrollment` is APTC enrollment by state; `spending` is monthly APTC enrollment multiplied by average monthly APTC for APTC recipients
1313

14+
- **agi_state.csv**
15+
• Source: IRS SOI state data file used by legacy local calibration
16+
• Date: tax year 2022
17+
• Created by: `policyengine_us_data/storage/calibration_targets/refresh_local_agi_state_targets.py`
18+
• Location: https://www.irs.gov/pub/irs-soi/22in55cmcsv.csv
19+
• Notes: This file intentionally keeps the legacy `utils/loss.py` schema (`AL`, `DC`, etc.) instead of the newer `state_AL` geography naming used in `soi.csv`/database overlays. It is separate from `soi_targets.csv`, and it currently lags the national SOI refresh because IRS geographic state SOI files are only published through TY2022.
20+
1421
- **medicaid_enrollment_2024.csv**
1522
• Source: MACPAC Enrollment Tables, FFY 2024
1623
• Date: 2024

policyengine_us_data/storage/calibration_targets/agi_state.csv

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
6363
0400000US10,DE,75000.0,100000.0,51060,1,adjusted_gross_income/count
6464
0400000US10,DE,100000.0,200000.0,89920,1,adjusted_gross_income/count
6565
0400000US10,DE,200000.0,500000.0,30280,1,adjusted_gross_income/count
66-
,DC,-inf,1.0,4700,1,adjusted_gross_income/count
67-
,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
68-
,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
69-
,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
70-
,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
71-
,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
72-
,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
73-
,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
66+
0400000US11,DC,-inf,1.0,4700,1,adjusted_gross_income/count
67+
0400000US11,DC,1.0,10000.0,27330,1,adjusted_gross_income/count
68+
0400000US11,DC,10000.0,25000.0,44140,1,adjusted_gross_income/count
69+
0400000US11,DC,25000.0,50000.0,65340,1,adjusted_gross_income/count
70+
0400000US11,DC,50000.0,75000.0,55150,1,adjusted_gross_income/count
71+
0400000US11,DC,75000.0,100000.0,39010,1,adjusted_gross_income/count
72+
0400000US11,DC,100000.0,200000.0,66180,1,adjusted_gross_income/count
73+
0400000US11,DC,200000.0,500000.0,36310,1,adjusted_gross_income/count
7474
0400000US12,FL,-inf,1.0,216880,1,adjusted_gross_income/count
7575
0400000US12,FL,1.0,10000.0,1123740,1,adjusted_gross_income/count
7676
0400000US12,FL,10000.0,25000.0,2180990,1,adjusted_gross_income/count
@@ -414,7 +414,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
414414
0400000US06,CA,500000.0,inf,426810,1,adjusted_gross_income/count
415415
0400000US08,CO,500000.0,inf,51500,1,adjusted_gross_income/count
416416
0400000US09,CT,500000.0,inf,45510,1,adjusted_gross_income/count
417-
,DC,500000.0,inf,10530,1,adjusted_gross_income/count
417+
0400000US11,DC,500000.0,inf,10530,1,adjusted_gross_income/count
418418
0400000US10,DE,500000.0,inf,5350,1,adjusted_gross_income/count
419419
0400000US12,FL,500000.0,inf,197090,1,adjusted_gross_income/count
420420
0400000US13,GA,500000.0,inf,65350,1,adjusted_gross_income/count
@@ -522,14 +522,14 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
522522
0400000US10,DE,75000.0,100000.0,4427687000,0,adjusted_gross_income/amount
523523
0400000US10,DE,100000.0,200000.0,12401957000,0,adjusted_gross_income/amount
524524
0400000US10,DE,200000.0,500000.0,8502065000,0,adjusted_gross_income/amount
525-
,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
526-
,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
527-
,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
528-
,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
529-
,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
530-
,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
531-
,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
532-
,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
525+
0400000US11,DC,-inf,1.0,-394257000,0,adjusted_gross_income/amount
526+
0400000US11,DC,1.0,10000.0,136195000,0,adjusted_gross_income/amount
527+
0400000US11,DC,10000.0,25000.0,757904000,0,adjusted_gross_income/amount
528+
0400000US11,DC,25000.0,50000.0,2434747000,0,adjusted_gross_income/amount
529+
0400000US11,DC,50000.0,75000.0,3419215000,0,adjusted_gross_income/amount
530+
0400000US11,DC,75000.0,100000.0,3380668000,0,adjusted_gross_income/amount
531+
0400000US11,DC,100000.0,200000.0,9164382000,0,adjusted_gross_income/amount
532+
0400000US11,DC,200000.0,500000.0,10828885000,0,adjusted_gross_income/amount
533533
0400000US12,FL,-inf,1.0,-19196408000,0,adjusted_gross_income/amount
534534
0400000US12,FL,1.0,10000.0,5776254000,0,adjusted_gross_income/amount
535535
0400000US12,FL,10000.0,25000.0,37314354000,0,adjusted_gross_income/amount
@@ -873,7 +873,7 @@ GEO_ID,GEO_NAME,AGI_LOWER_BOUND,AGI_UPPER_BOUND,VALUE,IS_COUNT,VARIABLE
873873
0400000US06,CA,500000.0,inf,613219427000,0,adjusted_gross_income/amount
874874
0400000US08,CO,500000.0,inf,71426453000,0,adjusted_gross_income/amount
875875
0400000US09,CT,500000.0,inf,77248832000,0,adjusted_gross_income/amount
876-
,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
876+
0400000US11,DC,500000.0,inf,17097350000,0,adjusted_gross_income/amount
877877
0400000US10,DE,500000.0,inf,6773920000,0,adjusted_gross_income/amount
878878
0400000US12,FL,500000.0,inf,427887554000,0,adjusted_gross_income/amount
879879
0400000US13,GA,500000.0,inf,92080953000,0,adjusted_gross_income/amount
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
"""Refresh tracked SOI targets used by legacy local calibration.
2+
3+
This regenerates ``agi_state.csv`` from the IRS geographic SOI state file while
4+
preserving the legacy schema consumed by ``utils/loss.py``:
5+
6+
- ``GEO_NAME`` is the two-letter state abbreviation
7+
- ``VARIABLE`` is ``adjusted_gross_income/count`` or ``.../amount``
8+
- AGI bounds live in ``AGI_LOWER_BOUND`` / ``AGI_UPPER_BOUND``
9+
10+
This file intentionally remains separate from the national workbook-backed
11+
``soi_targets.csv`` refresh path because IRS geographic releases lag the
12+
national Publication 1304 tables.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import argparse
18+
from pathlib import Path
19+
20+
import numpy as np
21+
import pandas as pd
22+
23+
24+
# Directory that contains this script.
CALIBRATION_FOLDER = Path(__file__).resolve().parents[0]
# Default destination of the refreshed targets: a CSV next to this script.
TARGETS_PATH = CALIBRATION_FOLDER.joinpath("agi_state.csv")
# Tax year of the IRS state SOI file pulled by default. The ``LOCAL_`` name is
# an alias of the same value and is the one used for function defaults below.
STATE_SOI_TAX_YEAR = 2022
LOCAL_STATE_SOI_TAX_YEAR = STATE_SOI_TAX_YEAR
28+
29+
# Ordered AGI bands as (label, lower bound, upper bound). The 1-based position
# of a band in this tuple is the AGI_STUB code that maps to it.
_AGI_BANDS = (
    ("Under $1", -np.inf, 1),
    ("$1 under $10,000", 1, 10_000),
    ("$10,000 under $25,000", 10_000, 25_000),
    ("$25,000 under $50,000", 25_000, 50_000),
    ("$50,000 under $75,000", 50_000, 75_000),
    ("$75,000 under $100,000", 75_000, 100_000),
    ("$100,000 under $200,000", 100_000, 200_000),
    ("$200,000 under $500,000", 200_000, 500_000),
    ("$500,000 or more", 500_000, np.inf),
)

# AGI_STUB code (1..9) -> band label.
AGI_STUB_TO_BAND = {
    stub: label for stub, (label, _lower, _upper) in enumerate(_AGI_BANDS, start=1)
}

# Band label -> (lower bound, upper bound) of adjusted gross income.
AGI_BOUNDS = {label: (lower, upper) for label, lower, upper in _AGI_BANDS}
52+
53+
# Two-letter state abbreviation -> two-character, zero-padded FIPS code (both
# strings). Covers the 50 states plus DC; the code is appended to "0400000US"
# to form GEO_ID.
STATE_ABBR_TO_FIPS = dict(
    entry.split(":")
    for entry in (
        "AL:01 AK:02 AZ:04 AR:05 CA:06 CO:08 CT:09 DE:10 DC:11 FL:12 "
        "GA:13 HI:15 ID:16 IL:17 IN:18 IA:19 KS:20 KY:21 LA:22 ME:23 "
        "MD:24 MA:25 MI:26 MN:27 MS:28 MO:29 MT:30 NE:31 NV:32 NH:33 "
        "NJ:34 NM:35 NY:36 NC:37 ND:38 OH:39 OK:40 OR:41 PA:42 RI:44 "
        "SC:45 SD:46 TN:47 TX:48 UT:49 VT:50 VA:51 WA:53 WV:54 WI:55 "
        "WY:56"
    ).split()
)
106+
107+
# STATE codes dropped before building the state targets.
# NOTE(review): presumably the national total ("US"), territories and an
# "other areas" bucket ("OA") -- confirm against the IRS file documentation.
NON_VOTING_STATES = {
    "US",
    "AS",
    "GU",
    "MP",
    "PR",
    "VI",
    "OA",
}

# One entry per emitted target family:
# (source column in the IRS file, VARIABLE label written out, is-count flag).
VARIABLE_SPECS = (
    (
        "N1",
        "adjusted_gross_income/count",
        True,
    ),
    (
        "A00100",
        "adjusted_gross_income/amount",
        False,
    ),
)
112+
113+
114+
def _state_soi_url(tax_year: int) -> str:
    """Return the IRS download URL of the state SOI CSV for ``tax_year``.

    Only the last two digits of the year (zero-padded) appear in the file name.
    """
    two_digit_year = format(tax_year % 100, "02d")
    return "https://www.irs.gov/pub/irs-soi/" + two_digit_year + "in55cmcsv.csv"
116+
117+
118+
def _load_state_soi_raw(tax_year: int = LOCAL_STATE_SOI_TAX_YEAR) -> pd.DataFrame:
    """Read the IRS state SOI CSV for ``tax_year`` straight from its URL.

    Commas are treated as thousands separators while parsing.
    """
    url = _state_soi_url(tax_year)
    raw = pd.read_csv(url, thousands=",")
    return raw
120+
121+
122+
def _base_state_frame(source_df: pd.DataFrame) -> pd.DataFrame:
    """Normalise a raw IRS state SOI frame to one row per state and AGI band.

    Steps, applied to a copy of ``source_df``:

    1. AGI_STUB 9 and 10 are summed per STATE into a single stub 9 row (the
       open-ended top band). Only the columns named in ``VARIABLE_SPECS`` are
       carried on those merged rows.
    2. AGI_STUB 0 rows and every STATE listed in ``NON_VOTING_STATES`` are
       dropped.
    3. ``agi_bracket``, ``GEO_NAME``, ``GEO_ID``, ``AGI_LOWER_BOUND`` and
       ``AGI_UPPER_BOUND`` are added.

    The merged top-band rows are appended after all other rows, which is the
    row order the tracked ``agi_state.csv`` uses.

    Args:
        source_df: Raw frame with at least ``STATE``, ``AGI_STUB`` and the
            value columns named in ``VARIABLE_SPECS``.

    Returns:
        A new frame; ``source_df`` is not modified.

    Raises:
        ValueError: If a retained row has an ``AGI_STUB`` without a band in
            ``AGI_STUB_TO_BAND`` or a ``STATE`` without a FIPS code in
            ``STATE_ABBR_TO_FIPS``. Failing loudly avoids writing rows with a
            blank ``GEO_ID``.
    """
    frame = source_df.copy()

    # Aggregate exactly the columns that are later emitted, so that adding a
    # spec cannot leave the merged top band empty for the new column.
    value_columns = [column for column, _variable, _is_count in VARIABLE_SPECS]
    in_top_tail = frame["AGI_STUB"].isin([9, 10])
    merged_top_tail = (
        frame[in_top_tail]
        .groupby("STATE", as_index=False)
        .agg({column: "sum" for column in value_columns})
        .assign(AGI_STUB=9)
    )
    frame = pd.concat([frame[~in_top_tail], merged_top_tail], ignore_index=True)

    frame = frame[frame["AGI_STUB"] != 0]
    # NON_VOTING_STATES already contains "US", so no extra union is needed.
    frame = frame.loc[~frame["STATE"].isin(NON_VOTING_STATES)].copy()

    frame["agi_bracket"] = frame["AGI_STUB"].map(AGI_STUB_TO_BAND)
    unknown_stubs = frame.loc[frame["agi_bracket"].isna(), "AGI_STUB"].unique()
    if len(unknown_stubs):
        raise ValueError(
            "No AGI band defined for AGI_STUB value(s): "
            + ", ".join(sorted(str(stub) for stub in unknown_stubs))
        )

    frame["GEO_NAME"] = frame["STATE"]
    fips = frame["GEO_NAME"].map(STATE_ABBR_TO_FIPS)
    unknown_states = frame.loc[fips.isna(), "GEO_NAME"].unique()
    if len(unknown_states):
        raise ValueError(
            "No FIPS code defined for STATE value(s): "
            + ", ".join(sorted(str(state) for state in unknown_states))
        )
    frame["GEO_ID"] = "0400000US" + fips

    frame["AGI_LOWER_BOUND"] = frame["agi_bracket"].map(
        lambda band: AGI_BOUNDS[band][0]
    )
    frame["AGI_UPPER_BOUND"] = frame["agi_bracket"].map(
        lambda band: AGI_BOUNDS[band][1]
    )
    return frame
140+
141+
142+
def build_local_agi_state_targets(
    source_df: pd.DataFrame | None = None,
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
) -> pd.DataFrame:
    """Build the long-format ``agi_state.csv`` targets table.

    Args:
        source_df: Raw IRS state SOI frame. When ``None`` the file for
            ``tax_year`` is downloaded instead.
        tax_year: Tax year to download; only used when ``source_df`` is
            ``None``.

    Returns:
        A frame with columns ``GEO_ID``, ``GEO_NAME``, ``AGI_LOWER_BOUND``,
        ``AGI_UPPER_BOUND``, ``VALUE``, ``IS_COUNT`` and ``VARIABLE``: all rows
        of the first ``VARIABLE_SPECS`` entry followed by all rows of the next.
    """
    raw = source_df if source_df is not None else _load_state_soi_raw(tax_year=tax_year)
    base = _base_state_frame(raw)
    key_columns = ["GEO_ID", "GEO_NAME", "AGI_LOWER_BOUND", "AGI_UPPER_BOUND"]

    pieces = []
    for source_column, label, counts_returns in VARIABLE_SPECS:
        values = base[source_column]
        if not counts_returns:
            # Amount columns are scaled by 1,000 (presumably the IRS file
            # reports them in thousands of dollars -- confirm).
            values = values * 1_000
        pieces.append(
            base[key_columns].assign(
                VALUE=values,
                IS_COUNT=int(counts_returns),
                VARIABLE=label,
            )
        )

    # ignore_index already yields a fresh 0..n-1 index.
    return pd.concat(pieces, ignore_index=True)
162+
163+
164+
def build_agi_state_targets(
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
    source_df: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """Alias of ``build_local_agi_state_targets`` that takes ``tax_year`` first.

    Both arguments are forwarded unchanged by keyword.
    """
    targets = build_local_agi_state_targets(tax_year=tax_year, source_df=source_df)
    return targets
172+
173+
174+
def refresh_local_agi_state_targets(
    out_path: Path = TARGETS_PATH,
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
) -> Path:
    """Rebuild the state AGI targets and write them to ``out_path`` as CSV.

    Args:
        out_path: Destination CSV path; defaults to the tracked
            ``agi_state.csv`` next to this script.
        tax_year: IRS state SOI tax year to download. Defaults to
            ``LOCAL_STATE_SOI_TAX_YEAR``, so existing callers are unaffected;
            the parameter mirrors ``refresh_agi_state_targets``.

    Returns:
        ``out_path``, the path that was written.
    """
    targets = build_local_agi_state_targets(tax_year=tax_year)
    targets.to_csv(out_path, index=False)
    return out_path
180+
181+
182+
def refresh_agi_state_targets(
    tax_year: int = LOCAL_STATE_SOI_TAX_YEAR,
    out_path: Path = TARGETS_PATH,
) -> pd.DataFrame:
    """Rebuild the state AGI targets for ``tax_year`` and save them as CSV.

    Args:
        tax_year: IRS state SOI tax year to download.
        out_path: Destination CSV path, written without the index column.

    Returns:
        The targets frame that was written.
    """
    refreshed = build_local_agi_state_targets(tax_year=tax_year)
    refreshed.to_csv(out_path, index=False)
    return refreshed
189+
190+
191+
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: refresh ``agi_state.csv``.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, i.e. the normal command-line behaviour;
            passing a list lets tests and other code drive the CLI directly.
    """
    parser = argparse.ArgumentParser(
        description="Refresh agi_state.csv for local calibration"
    )
    parser.add_argument(
        "--tax-year",
        type=int,
        default=LOCAL_STATE_SOI_TAX_YEAR,
        help="IRS geographic SOI tax year to pull",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=TARGETS_PATH,
        help="Output CSV path",
    )
    args = parser.parse_args(argv)
    refresh_agi_state_targets(tax_year=args.tax_year, out_path=args.out)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)