Add performance tests and an example notebook

Miruna Oprescu · Miruna Oprescu · commit 1070aea9648d · 2021-04-07T09:39:40.000-04:00
diff --git a/econml/dml/dynamic_dml.py b/econml/dml/dynamic_dml.py
@@ -482,7 +482,7 @@ def _gen_ortho_learner_model_final(self, n_periods):
         return _LinearDynamicModelFinal(wrapped_final_model, n_periods=n_periods)
 
     def _prefit(self, Y, T, *args, groups=None, only_final=False, **kwargs):
-        u_periods = np.unique(np.bincount(groups.astype(int)))
+        u_periods = np.unique(np.unique(groups, return_counts=True)[1])
         if len(u_periods) > 1:
             raise AttributeError(
                 "Imbalanced panel. Method currently expects only panels with equal number of periods. Pad your data")
diff --git a/econml/tests/dgp.py b/econml/tests/dgp.py
@@ -0,0 +1,156 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+import numpy as np
+from econml.utilities import cross_product
+from statsmodels.tools.tools import add_constant
+
+
+class _BaseDynamicPanelDGP:
+    """Base class for dynamic panel data generating processes (DGPs).
+
+    Subclasses are expected to override ``create_instance`` and
+    ``_gen_data_with_policy``; the versions defined here do nothing and
+    return ``None``. The policy helpers below only rely on the first element
+    returned by ``_gen_data_with_policy`` being the outcome array.
+
+    Parameters
+    ----------
+    n_periods : number of periods; outcomes are read in consecutive blocks of this length
+    n_treatments : number of treatments
+    n_x : number of state variables
+    """
+
+    def __init__(self, n_periods, n_treatments, n_x):
+        self.n_periods = n_periods
+        self.n_treatments = n_treatments
+        self.n_x = n_x
+
+    def create_instance(self, *args, **kwargs):
+        """Hook for subclasses to set up the DGP parameters; a no-op here."""
+        pass
+
+    def _gen_data_with_policy(self, n_units, policy_gen, random_seed=123):
+        """Hook for subclasses to simulate data under ``policy_gen``; a no-op here."""
+        pass
+
+    def static_policy_data(self, n_units, tau, random_seed=123):
+        """Simulate data under a fixed policy that assigns ``tau[period]`` in every period."""
+        def policy_gen(Tpre, X, period):
+            return tau[period]
+        return self._gen_data_with_policy(n_units, policy_gen, random_seed=random_seed)
+
+    def adaptive_policy_data(self, n_units, policy_gen, random_seed=123):
+        """Simulate data under a caller-supplied ``policy_gen(Tpre, X, period)``."""
+        return self._gen_data_with_policy(n_units, policy_gen, random_seed=random_seed)
+
+    def _final_period_mean(self, Y):
+        """Average the entries of ``Y`` that sit at the last position of each block of ``n_periods``."""
+        # Same rows as the mask `np.arange(len(Y)) % n_periods == n_periods - 1`.
+        return np.mean(Y[self.n_periods - 1::self.n_periods])
+
+    def _zero_policy_final_mean(self, mc_samples):
+        """Mean final-period outcome when every treatment is held at zero."""
+        baseline = np.zeros((self.n_periods, self.n_treatments))
+        return self._final_period_mean(self.static_policy_data(mc_samples, baseline)[0])
+
+    def static_policy_effect(self, tau, mc_samples=1000):
+        """Monte Carlo estimate of the final-period effect of the static policy ``tau``.
+
+        Parameters
+        ----------
+        tau : per-period treatment values, indexed as ``tau[period]``
+        mc_samples : number of units to simulate for each of the two policies
+
+        Returns
+        -------
+        Mean final-period outcome under ``tau`` minus the mean under the all-zero policy.
+        """
+        # Take the outcome by position rather than unpacking a fixed number of
+        # values: the simulator may return more arrays than just (Y, T, X, groups).
+        Y_tau = self.static_policy_data(mc_samples, tau)[0]
+        return self._final_period_mean(Y_tau) - self._zero_policy_final_mean(mc_samples)
+
+    def adaptive_policy_effect(self, policy_gen, mc_samples=1000):
+        """Monte Carlo estimate of the final-period effect of the adaptive policy ``policy_gen``.
+
+        Parameters
+        ----------
+        policy_gen : callable ``(Tpre, X, period)`` returning the treatment for that period
+        mc_samples : number of units to simulate for each of the two policies
+
+        Returns
+        -------
+        Mean final-period outcome under ``policy_gen`` minus the mean under the all-zero policy.
+        """
+        Y_tau = self.adaptive_policy_data(mc_samples, policy_gen)[0]
+        return self._final_period_mean(Y_tau) - self._zero_policy_final_mean(mc_samples)
+
+
+class DynamicPanelDGP(_BaseDynamicPanelDGP):
+    """Linear dynamic panel DGP with optional treatment effect heterogeneity.
+
+    ``create_instance`` must be called to draw the model parameters before any
+    data is generated.
+    """
+
+    def __init__(self, n_periods, n_treatments, n_x):
+        super().__init__(n_periods, n_treatments, n_x)
+
+    def create_instance(self, s_x, sigma_x=.5, sigma_y=.5, conf_str=5, hetero_strength=0, hetero_inds=None,
+                        autoreg=.5, state_effect=.5, random_seed=123):
+        """Draw the parameters of the DGP and store them on the instance.
+
+        Parameters
+        ----------
+        s_x : number of leading state variables that enter the outcome (each with weight ``conf_str / s_x``)
+        sigma_x : std of the noise added to the state variables
+        sigma_y : std of the noise added to the outcome
+        conf_str : total weight of the state variables in the outcome
+        hetero_strength : scale of the heterogeneity coefficients
+        hetero_inds : indices of the state variables that drive effect heterogeneity, or None for none
+        autoreg : scale of the state transition matrix ``Beta``
+        state_effect : L1 norm of each row of ``Alpha``, the effect of the treatments on the next state
+        random_seed : seed passed to ``np.random.seed``
+
+        Returns
+        -------
+        self
+        """
+        np.random.seed(random_seed)
+        self.s_x = s_x
+        self.conf_str = conf_str
+        self.sigma_x = sigma_x
+        self.sigma_y = sigma_y
+        if hetero_inds is None:
+            # No heterogeneity features: every state variable is endogenous.
+            self.hetero_inds = None
+            self.endo_inds = np.arange(self.n_x)
+        else:
+            # np.asarray lets callers pass a list as well as an ndarray.
+            self.hetero_inds = np.asarray(hetero_inds).astype(int)
+            self.endo_inds = np.setdiff1d(
+                np.arange(self.n_x), self.hetero_inds).astype(int)
+        # The first s_x state variables are confounders. The final s_x variables are exogenous and can create
+        # heterogeneity
+        self.Alpha = np.random.uniform(-1, 1,
+                                       size=(self.n_x, self.n_treatments))
+        self.Alpha /= np.linalg.norm(self.Alpha, axis=1, ord=1, keepdims=True)
+        self.Alpha *= state_effect
+        if self.hetero_inds is not None:
+            # Treatments never move the heterogeneity features.
+            self.Alpha[self.hetero_inds] = 0
+
+        self.Beta = np.zeros((self.n_x, self.n_x))
+        for t in range(self.n_x):
+            self.Beta[t, :] = autoreg * np.roll(np.random.uniform(low=4.0**(-np.arange(
+                0, self.n_x)), high=4.0**(-np.arange(1, self.n_x + 1))), t)
+        if self.hetero_inds is not None:
+            # Decouple the heterogeneity features from the other states in both directions.
+            self.Beta[np.ix_(self.endo_inds, self.hetero_inds)] = 0
+            self.Beta[np.ix_(self.hetero_inds, self.endo_inds)] = 0
+
+        self.epsilon = np.random.uniform(-1, 1, size=self.n_treatments)
+        self.zeta = np.zeros(self.n_x)
+        self.zeta[:self.s_x] = self.conf_str / self.s_x
+
+        self.y_hetero_effect = np.zeros(self.n_x)
+        self.x_hetero_effect = np.zeros(self.n_x)
+        if self.hetero_inds is not None:
+            self.y_hetero_effect[self.hetero_inds] = np.random.uniform(.5 * hetero_strength, 1.5 * hetero_strength) / \
+                len(self.hetero_inds)
+            self.x_hetero_effect[self.hetero_inds] = np.random.uniform(.5 * hetero_strength, 1.5 * hetero_strength) / \
+                len(self.hetero_inds)
+
+        # Row 0 is the same-period effect of the treatment on the outcome; row t >= 1 is the
+        # effect that reaches the outcome t periods later through the states.
+        self.true_effect = np.zeros((self.n_periods, self.n_treatments))
+        self.true_effect[0] = self.epsilon
+        for t in np.arange(1, self.n_periods):
+            self.true_effect[t, :] = (self.zeta.reshape(
+                1, -1) @ np.linalg.matrix_power(self.Beta, t - 1) @ self.Alpha)
+
+        self.true_hetero_effect = np.zeros(
+            (self.n_periods, (self.n_x + 1) * self.n_treatments))
+        self.true_hetero_effect[0, :] = cross_product(
+            add_constant(self.y_hetero_effect.reshape(1, -1), has_constant='add'),
+            self.epsilon.reshape(1, -1))
+        for t in np.arange(1, self.n_periods):
+            self.true_hetero_effect[t, :] = cross_product(
+                add_constant(self.x_hetero_effect.reshape(1, -1), has_constant='add'),
+                self.zeta.reshape(1, -1) @ np.linalg.matrix_power(self.Beta, t - 1) @ self.Alpha)
+        return self
+
+    def hetero_effect_fn(self, t, x):
+        """Return the per-treatment effect at lag ``t`` for the state vector ``x``.
+
+        ``t == 0`` gives the same-period effect; ``t >= 1`` gives the effect carried
+        through the states over ``t`` periods. ``x`` is flattened and must have ``n_x`` entries.
+        """
+        if t == 0:
+            return (np.dot(self.y_hetero_effect, x.flatten()) + 1) * self.epsilon
+        else:
+            return (np.dot(self.x_hetero_effect, x.flatten()) + 1) *\
+                (self.zeta.reshape(1, -1) @ np.linalg.matrix_power(self.Beta, t - 1)
+                    @ self.Alpha).flatten()
+
+    def _gen_data_with_policy(self, n_units, policy_gen, random_seed=123):
+        """Simulate ``n_units`` units for ``n_periods`` periods each under ``policy_gen``.
+
+        Rows are laid out unit by unit, with the periods of a unit consecutive.
+
+        Returns
+        -------
+        Y : outcomes, shape (n_units * n_periods,)
+        T : treatments, shape (n_units * n_periods, n_treatments)
+        X : the ``hetero_inds`` columns of the states, or None when no ``hetero_inds`` were set
+        W : the remaining (``endo_inds``) columns of the states
+        groups : unit index of every row
+        """
+        np.random.seed(random_seed)
+        Y = np.zeros(n_units * self.n_periods)
+        T = np.zeros((n_units * self.n_periods, self.n_treatments))
+        X = np.zeros((n_units * self.n_periods, self.n_x))
+        groups = np.zeros(n_units * self.n_periods)
+        for t in range(n_units * self.n_periods):
+            period = t % self.n_periods
+            if period == 0:
+                # First period of a unit: fresh state and no previous treatment.
+                X[t] = np.random.normal(0, self.sigma_x, size=self.n_x)
+                T[t] = policy_gen(np.zeros(self.n_treatments), X[t], period)
+            else:
+                X[t] = (np.dot(self.x_hetero_effect, X[t - 1]) + 1) * np.dot(self.Alpha, T[t - 1]) + \
+                    np.dot(self.Beta, X[t - 1]) + \
+                    np.random.normal(0, self.sigma_x, size=self.n_x)
+                T[t] = policy_gen(T[t - 1], X[t], period)
+            Y[t] = (np.dot(self.y_hetero_effect, X[t]) + 1) * np.dot(self.epsilon, T[t]) + \
+                np.dot(X[t], self.zeta) + \
+                np.random.normal(0, self.sigma_y)
+            groups[t] = t // self.n_periods
+
+        # Compare against None explicitly: the truth value of an index array is an error
+        # for more than one element and False for the single index 0.
+        X_hetero = X[:, self.hetero_inds] if self.hetero_inds is not None else None
+        return Y, T, X_hetero, X[:, self.endo_inds], groups
+
+    def observational_data(self, n_units, gamma=0, s_t=1, sigma_t=0.5, random_seed=123):
+        """ Generate observational data with some observational treatment policy parameters
+
+        Parameters
+        ----------
+        n_units : how many units to observe
+        gamma : what is the degree of auto-correlation of the treatments across periods
+        s_t : sparsity of treatment policy; how many states does it depend on
+        sigma_t : what is the std of the exploration/randomness in the treatment
+        random_seed : seed used for the simulation
+
+        Returns
+        -------
+        The tuple ``(Y, T, X, W, groups)`` produced by ``_gen_data_with_policy``.
+        """
+        Delta = np.zeros((self.n_treatments, self.n_x))
+        Delta[:, :s_t] = self.conf_str / s_t
+
+        def policy_gen(Tpre, X, period):
+            return gamma * Tpre + (1 - gamma) * np.dot(Delta, X) + \
+                np.random.normal(0, sigma_t, size=self.n_treatments)
+        return self._gen_data_with_policy(n_units, policy_gen, random_seed=random_seed)
diff --git a/econml/tests/test_dynamic_dml.py b/econml/tests/test_dynamic_dml.py
@@ -12,6 +12,7 @@
 from econml.inference import BootstrapInference, EmpiricalInferenceResults, NormalInferenceResults
 from econml.utilities import shape, hstack, vstack, reshape, cross_product
 import econml.tests.utilities  # bugfix for assertWarns
+from econml.tests.dgp import DynamicPanelDGP
 
 
 class TestDynamicDML(unittest.TestCase):
@@ -79,7 +80,6 @@ def make_random(n, is_discrete, d):
                                     (d_y if d_y > 0 else 1) * (d_t_final if d_t_final > 0 else 1), 6)
 
                                 all_infs = [None, 'auto', BootstrapInference(2)]
-                                #all_infs = [None, 'auto']
                                 est = DynamicDML(model_y=Lasso() if d_y < 1 else MultiTaskLasso(),
                                                  model_t=LogisticRegression() if is_discrete else
                                                  (Lasso() if d_t < 1 else MultiTaskLasso()),
@@ -256,3 +256,40 @@ def make_random(n, is_discrete, d):
                                             eff = est.effect(X) if not is_discrete else est.effect(
                                                 X, T0='a', T1='b')
                                             self.assertEqual(shape(eff), effect_shape2)
+
+    def test_perf(self):
+        """Check that DynamicDML recovers the DGP's true effects within tolerance and that
+        the reported intervals contain them, first without and then with effect heterogeneity."""
+        np.random.seed(123)
+        n_units = 400
+        n_periods = 3
+        n_treatments = 1
+        n_x = 100
+        s_x = 10
+        s_t = 10
+        alpha_regs = [1e-4, 1e-3, 1e-2, 5e-2, .1, 1]
+
+        def lasso_model():
+            return LassoCV(cv=3, alphas=alpha_regs, max_iter=500)
+        # No heterogeneity
+        dgp = DynamicPanelDGP(n_periods, n_treatments, n_x).create_instance(
+            s_x, random_seed=1)
+        Y, T, X, W, groups = dgp.observational_data(n_units, s_t=s_t, random_seed=12)
+        est = DynamicDML(model_y=lasso_model(), model_t=lasso_model(), cv=3)
+        est.fit(Y, T, X=X, W=W, groups=groups, inference="auto")
+        true_effect = dgp.true_effect.flatten()
+        intercept_interval = est.intercept__interval()
+        np.testing.assert_allclose(est.intercept_, true_effect, atol=1e-01)
+        np.testing.assert_array_less(intercept_interval[0], true_effect)
+        np.testing.assert_array_less(true_effect, intercept_interval[1])
+        # Heterogeneous effects
+        hetero_strength = .5
+        hetero_inds = np.arange(n_x - n_treatments, n_x)
+        dgp = DynamicPanelDGP(n_periods, n_treatments, n_x).create_instance(
+            s_x, hetero_strength=hetero_strength, hetero_inds=hetero_inds, random_seed=1)
+        Y, T, X, W, groups = dgp.observational_data(n_units, s_t=s_t, random_seed=12)
+        est.fit(Y, T, X=X, W=W, groups=groups, inference="auto")
+        true_effect = dgp.true_effect.flatten()
+        # Column 0 of true_hetero_effect is the constant term, so feature i sits in column i + 1.
+        true_coef = dgp.true_hetero_effect[:, hetero_inds + 1]
+        intercept_interval = est.intercept__interval()
+        coef_interval = est.coef__interval()
+        np.testing.assert_allclose(est.intercept_, true_effect, atol=0.2)
+        np.testing.assert_allclose(est.coef_, true_coef, atol=0.2)
+        np.testing.assert_array_less(intercept_interval[0], true_effect)
+        np.testing.assert_array_less(true_effect, intercept_interval[1])
+        np.testing.assert_array_less(coef_interval[0], true_coef)
+        np.testing.assert_array_less(true_coef, coef_interval[1])
diff --git a/notebooks/Dynamic Double Machine Learning Examples.ipynb b/notebooks/Dynamic Double Machine Learning Examples.ipynb