Skip to content

Commit b4d9eeb

Browse files
authored
Merge pull request #701 from PolicyEngine/draft/sstb-qbi-inputs
Add SSTB QBI split inputs to us-data
2 parents e1ef6a1 + 8158f7b commit b4d9eeb

19 files changed

Lines changed: 1279 additions & 56 deletions

changelog.d/701.changed.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Add SSTB QBI split inputs to `policyengine-us-data` by exposing
2+
`sstb_self_employment_income`, `sstb_w2_wages_from_qualified_business`, and
3+
`sstb_unadjusted_basis_qualified_property` from the existing PUF/calibration
4+
pipeline. The current split follows the legacy all-or-nothing
5+
`business_is_sstb` flag, so mixed SSTB/non-SSTB allocations remain approximate
6+
until more granular source data or imputation is added.

docs/appendix.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,19 @@ for iteration in range(5000):
112112
- w2_wages_from_qualified_business
113113
- unadjusted_basis_qualified_property
114114
- business_is_sstb
115+
- sstb_self_employment_income
116+
- sstb_w2_wages_from_qualified_business
117+
- sstb_unadjusted_basis_qualified_property
115118
- qualified_reit_and_ptp_income
116119
- qualified_bdc_income
117120
- farm_operations_income
118121
- estate_income_would_be_qualified
119122
- farm_operations_income_would_be_qualified
120123
- farm_rent_income_would_be_qualified
124+
125+
The current PUF/calibration pipeline uses the legacy `business_is_sstb` flag to
126+
split the three `sstb_*` variables listed above on an all-or-nothing basis. It does not yet infer
127+
mixed SSTB and non-SSTB allocations within the same record.
121128
- partnership_s_corp_income_would_be_qualified
122129
- rental_income_would_be_qualified
123130
- self_employment_income_would_be_qualified

policyengine_us_data/__init__.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from importlib import import_module
22

33
from .geography import ZIP_CODE_DATASET
4+
from .utils.policyengine import ensure_policyengine_us_compat_variables
5+
6+
ensure_policyengine_us_compat_variables()
47

58
_LAZY_EXPORTS = {
69
"CPS_2024": (
@@ -26,7 +29,16 @@
2629

2730
def __getattr__(name: str):
2831
if name not in _LAZY_EXPORTS:
29-
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
32+
try:
33+
value = import_module(f"{__name__}.{name}")
34+
except ModuleNotFoundError as exc:
35+
if exc.name == f"{__name__}.{name}":
36+
raise AttributeError(
37+
f"module {__name__!r} has no attribute {name!r}"
38+
) from exc
39+
raise
40+
globals()[name] = value
41+
return value
3042

3143
module_name, attribute_name = _LAZY_EXPORTS[name]
3244
value = getattr(import_module(module_name), attribute_name)

policyengine_us_data/calibration/check_staging_sums.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
VARIABLES = [
2424
"adjusted_gross_income",
2525
"employment_income",
26-
"self_employment_income",
26+
"total_self_employment_income",
2727
"tax_unit_partnership_s_corp_income",
2828
"taxable_pension_income",
2929
"dividend_income",

policyengine_us_data/calibration/puf_impute.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,12 @@
5050
"pre_tax_contributions",
5151
"taxable_ira_distributions",
5252
"self_employment_income",
53+
"sstb_self_employment_income",
5354
"w2_wages_from_qualified_business",
5455
"unadjusted_basis_qualified_property",
5556
"business_is_sstb",
57+
"sstb_w2_wages_from_qualified_business",
58+
"sstb_unadjusted_basis_qualified_property",
5659
"short_term_capital_gains",
5760
"qualified_dividend_income",
5861
"charitable_cash_donations",
@@ -122,6 +125,8 @@
122125
"w2_wages_from_qualified_business",
123126
"unadjusted_basis_qualified_property",
124127
"business_is_sstb",
128+
"sstb_w2_wages_from_qualified_business",
129+
"sstb_unadjusted_basis_qualified_property",
125130
"charitable_cash_donations",
126131
"self_employed_pension_contribution_ald",
127132
"unrecaptured_section_1250_gain",
@@ -693,6 +698,11 @@ def _impute_retirement_contributions(
693698
X_test[income_var] = puf_imputations[income_var]
694699
else:
695700
X_test[income_var] = cps_sim.calculate(income_var).values
701+
if "sstb_self_employment_income" in puf_imputations:
702+
X_test["self_employment_income"] = (
703+
X_test["self_employment_income"]
704+
+ puf_imputations["sstb_self_employment_income"]
705+
)
696706

697707
del cps_sim
698708

@@ -723,13 +733,13 @@ def _impute_retirement_contributions(
723733
catch_up_eligible = age >= 50
724734
limit_401k = limits["401k"] + catch_up_eligible * limits["401k_catch_up"]
725735
limit_ira = limits["ira"] + catch_up_eligible * limits["ira_catch_up"]
736+
se_income = X_test["self_employment_income"].values
726737
se_pension_cap = np.minimum(
727-
X_test["self_employment_income"].values * limits["se_pension_rate"],
738+
se_income * limits["se_pension_rate"],
728739
limits["se_pension_dollar_limit"],
729740
)
730741

731742
emp_income = X_test["employment_income"].values
732-
se_income = X_test["self_employment_income"].values
733743

734744
result = {}
735745
for var in CPS_RETIREMENT_VARIABLES:

policyengine_us_data/calibration/target_config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ include:
2222
geo_level: district
2323
- variable: real_estate_taxes
2424
geo_level: district
25-
- variable: self_employment_income
25+
- variable: total_self_employment_income
2626
geo_level: district
2727
- variable: taxable_pension_income
2828
geo_level: district
@@ -163,9 +163,9 @@ include:
163163
- variable: non_refundable_ctc
164164
geo_level: national
165165
domain_variable: adjusted_gross_income,non_refundable_ctc
166-
- variable: self_employment_income
166+
- variable: total_self_employment_income
167167
geo_level: national
168-
domain_variable: self_employment_income
168+
domain_variable: total_self_employment_income
169169
- variable: tax_unit_partnership_s_corp_income
170170
geo_level: national
171171
domain_variable: tax_unit_partnership_s_corp_income
@@ -199,7 +199,7 @@ include:
199199
# Restore old loss.py's self-employment return-count target.
200200
- variable: tax_unit_count
201201
geo_level: national
202-
domain_variable: self_employment_income
202+
domain_variable: total_self_employment_income
203203

204204
# === NATIONAL — identity / population count targets from old loss.py ===
205205
- variable: person_count

policyengine_us_data/calibration/validate_national_h5.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
VARIABLES = [
2626
"adjusted_gross_income",
2727
"employment_income",
28-
"self_employment_income",
28+
"total_self_employment_income",
2929
"tax_unit_partnership_s_corp_income",
3030
"taxable_pension_income",
3131
"dividend_income",

policyengine_us_data/datasets/puf/puf.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import h5py
12
import yaml
23
from importlib.resources import files
34

@@ -432,6 +433,20 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
432433
0.0
433434
)
434435
puf["business_is_sstb"] = rng.binomial(n=1, p=pr_sstb)
436+
is_sstb = puf["business_is_sstb"].astype(bool)
437+
438+
# The current PUF pipeline only imputes an all-or-nothing SSTB flag.
439+
# Use that to split Schedule C self-employment and allocable W-2/UBIA
440+
# inputs for policyengine-us without pretending to observe mixed cases.
441+
legacy_self_employment_income = puf["self_employment_income"].fillna(0)
442+
puf["sstb_self_employment_income"] = np.where(
443+
is_sstb, legacy_self_employment_income, 0.0
444+
)
445+
puf["self_employment_income"] = np.where(
446+
is_sstb, 0.0, legacy_self_employment_income
447+
)
448+
puf["sstb_w2_wages_from_qualified_business"] = np.where(is_sstb, w2, 0.0)
449+
puf["sstb_unadjusted_basis_qualified_property"] = np.where(is_sstb, ubia, 0.0)
435450

436451
reit_params = QBI_PARAMS["reit_ptp_income_distribution"]
437452
p_reit_ptp = reit_params["probability_of_receiving"]
@@ -526,6 +541,9 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
526541
"w2_wages_from_qualified_business",
527542
"unadjusted_basis_qualified_property",
528543
"business_is_sstb",
544+
"sstb_self_employment_income",
545+
"sstb_w2_wages_from_qualified_business",
546+
"sstb_unadjusted_basis_qualified_property",
529547
"deductible_mortgage_interest",
530548
"partnership_s_corp_income",
531549
"partnership_se_income",
@@ -538,6 +556,164 @@ class PUF(Dataset):
538556
time_period = None
539557
data_format = Dataset.ARRAYS
540558

559+
@staticmethod
560+
def _replace_array(file_handle, key: str, values: np.ndarray) -> None:
561+
if key in file_handle:
562+
del file_handle[key]
563+
file_handle.create_dataset(key, data=values)
564+
565+
def _sstb_split_overrides(self) -> dict[str, np.ndarray]:
566+
if not self.file_path.exists():
567+
return {}
568+
569+
with h5py.File(self.file_path, "r") as file_handle:
570+
if "business_is_sstb" not in file_handle:
571+
return {}
572+
keys = set(file_handle.keys())
573+
is_sstb = np.asarray(file_handle["business_is_sstb"]).astype(bool)
574+
overrides = {}
575+
if "self_employment_income" in keys:
576+
self_employment_income = np.asarray(
577+
file_handle["self_employment_income"]
578+
)
579+
existing_sstb_self_employment_income = (
580+
np.asarray(file_handle["sstb_self_employment_income"])
581+
if "sstb_self_employment_income" in keys
582+
else np.zeros_like(self_employment_income)
583+
)
584+
corrected_sstb_self_employment_income = np.where(
585+
is_sstb,
586+
np.where(
587+
existing_sstb_self_employment_income != 0,
588+
existing_sstb_self_employment_income,
589+
self_employment_income,
590+
),
591+
0.0,
592+
)
593+
corrected_self_employment_income = np.where(
594+
is_sstb, 0.0, self_employment_income
595+
)
596+
if (
597+
"sstb_self_employment_income" not in keys
598+
or not np.array_equal(
599+
existing_sstb_self_employment_income,
600+
corrected_sstb_self_employment_income,
601+
)
602+
or not np.array_equal(
603+
self_employment_income,
604+
corrected_self_employment_income,
605+
)
606+
):
607+
overrides["sstb_self_employment_income"] = (
608+
corrected_sstb_self_employment_income
609+
)
610+
overrides["self_employment_income"] = (
611+
corrected_self_employment_income
612+
)
613+
614+
for source_key, target_key in (
615+
(
616+
"w2_wages_from_qualified_business",
617+
"sstb_w2_wages_from_qualified_business",
618+
),
619+
(
620+
"unadjusted_basis_qualified_property",
621+
"sstb_unadjusted_basis_qualified_property",
622+
),
623+
):
624+
if source_key not in keys:
625+
continue
626+
corrected_target = np.where(
627+
is_sstb, np.asarray(file_handle[source_key]), 0.0
628+
)
629+
if target_key not in keys or not np.array_equal(
630+
np.asarray(file_handle[target_key]),
631+
corrected_target,
632+
):
633+
overrides[target_key] = corrected_target
634+
635+
return overrides
636+
637+
def _ensure_sstb_split_inputs(self) -> dict[str, np.ndarray]:
638+
overrides = self._sstb_split_overrides()
639+
if not overrides:
640+
return {}
641+
642+
try:
643+
with h5py.File(self.file_path, "r+") as file_handle:
644+
for key, values in overrides.items():
645+
self._replace_array(file_handle, key, values)
646+
except OSError:
647+
pass
648+
649+
return overrides
650+
651+
class _OverrideView:
652+
def __init__(self, backing, overrides: dict[str, np.ndarray]):
653+
self._backing = backing
654+
self._overrides = overrides
655+
656+
def __getitem__(self, key):
657+
if key in self._overrides:
658+
return self._overrides[key]
659+
return self._backing[key]
660+
661+
def __contains__(self, key):
662+
return key in self._overrides or key in self._backing
663+
664+
def keys(self):
665+
if hasattr(self._backing, "keys"):
666+
return tuple(dict.fromkeys((*self._backing.keys(), *self._overrides)))
667+
return tuple(self._overrides)
668+
669+
def get(self, key, default=None):
670+
if key in self:
671+
return self[key]
672+
return default
673+
674+
def items(self):
675+
for key in self.keys():
676+
yield key, self[key]
677+
678+
def values(self):
679+
for key in self.keys():
680+
yield self[key]
681+
682+
def __iter__(self):
683+
return iter(self.keys())
684+
685+
def close(self):
686+
if hasattr(self._backing, "close"):
687+
self._backing.close()
688+
689+
def __enter__(self):
690+
if hasattr(self._backing, "__enter__"):
691+
self._backing.__enter__()
692+
return self
693+
694+
def __exit__(self, exc_type, exc, traceback):
695+
if hasattr(self._backing, "__exit__"):
696+
return self._backing.__exit__(exc_type, exc, traceback)
697+
return None
698+
699+
def __getattr__(self, name):
700+
return getattr(self._backing, name)
701+
702+
def load(self, key=None, mode="r"):
703+
if mode == "r":
704+
overrides = self._ensure_sstb_split_inputs()
705+
if key in overrides:
706+
return overrides[key]
707+
if key is None and overrides:
708+
return self._OverrideView(super().load(key=key, mode=mode), overrides)
709+
return super().load(key=key, mode=mode)
710+
711+
def load_dataset(self):
712+
overrides = self._ensure_sstb_split_inputs()
713+
arrays = super().load_dataset()
714+
arrays.update(overrides)
715+
return arrays
716+
541717
def generate(self):
542718
from policyengine_us.system import system
543719

policyengine_us_data/db/etl_irs_soi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
name="qualified_business_income_deduction",
6666
breakdown=None,
6767
),
68-
dict(code="00900", name="self_employment_income", breakdown=None),
68+
dict(code="00900", name="total_self_employment_income", breakdown=None),
6969
dict(
7070
code="01000",
7171
name="net_capital_gains",
@@ -147,7 +147,7 @@ def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) ->
147147
"net_capital_gains": "capital_gains_gross",
148148
"qualified_dividend_income": "qualified_dividends",
149149
"rental_income": "rent_and_royalty_net_income",
150-
"self_employment_income": "business_net_profits",
150+
"total_self_employment_income": "business_net_profits",
151151
"tax_exempt_interest_income": "exempt_interest",
152152
"tax_unit_partnership_s_corp_income": "partnership_and_s_corp_income",
153153
"taxable_interest_income": "taxable_interest_income",

0 commit comments

Comments
 (0)