From ac858cdc4b8e0b6a86d70e9db4980934912b6e26 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 2 Jun 2025 14:20:28 +0200 Subject: [PATCH 01/84] add a cross-sectional dgp --- doubleml/did/datasets/dgp_did_cs_CS2021.py | 190 +++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 doubleml/did/datasets/dgp_did_cs_CS2021.py diff --git a/doubleml/did/datasets/dgp_did_cs_CS2021.py b/doubleml/did/datasets/dgp_did_cs_CS2021.py new file mode 100644 index 00000000..95119b94 --- /dev/null +++ b/doubleml/did/datasets/dgp_did_cs_CS2021.py @@ -0,0 +1,190 @@ +import numpy as np + +from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021 + +# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC) +# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data +# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021) + + +def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs): + """ + Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on + Callaway and Sant'Anna (2021). + + This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups. + The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times, + and optionally a never-treated group that serves as a control. The true average treatment effect on the + treated (ATT) has a heterogeneous structure dependent on covariates and exposure time. + + The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features + and propensity score features are derived: + + - DGP 1: Outcome and propensity score are linear (in Z) + - DGP 2: Outcome is linear, propensity score is nonlinear + - DGP 3: Outcome is nonlinear, propensity score is linear + - DGP 4: Outcome and propensity score are nonlinear + - DGP 5: Outcome is linear, propensity score is constant (experimental setting) + - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting) + + Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + + For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are: + + 1. Time-varying outcome regression function for each time period :math:`t`: + + .. math:: + + f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4) + + 2. Group-specific propensity function for each treatment group :math:`g`: + + .. math:: + + f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot + (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4) + + where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a + scale parameter (default: 0.9). + + The panel data model is defined with the following components: + + 1. 
Time effects: :math:`\\delta_t = t` for time period :math:`t`
+
+    2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group
+
+    3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is:
+
+    .. math::
+
+        \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0)
+
+    where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit
+    :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods).
+
+    4. Potential outcomes for unit :math:`i` in period :math:`t`:
+
+    .. math::
+
+        Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t}
+
+        Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t})
+
+    where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`.
+
+    5. Observed outcomes:
+
+    .. math::
+
+        Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\}
+
+    6. Treatment assignment:
+
+    For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is:
+
+    .. math::
+
+        P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))}
+
+    For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability:
+
+    .. math::
+
+        P(G_i = g) = \\frac{1}{G} \\text{ for all } g
+
+    7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is
+    increased to `n_obs/lambda_t`, where `lambda_t` denotes the probability of observing a unit in a given time period
+    (constant over time). For each time period, units are then randomly selected to be observed with probability
+    `lambda_t`.
+
+    The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type:
+
+    .. math::
+
+        DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
+
+        DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
+
+        DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
+
+        DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
+
+        DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
+
+        DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0
+
+    where settings 5-6 correspond to experimental designs with equal probability across treatment groups.
+
+
+    Parameters
+    ----------
+    n_obs : int, default=1000
+        The number of observations to simulate.
+
+    dgp_type : int, default=1
+        The data generating process to be used (1-6).
+
+    include_never_treated : bool, default=True
+        Whether to include units that are never treated.
+
+    lambda_t : float, default=0.5
+        Probability of observing a unit at each time period.
+
+    time_type : str, default="datetime"
+        Type of time variable. Either "datetime" or "float".
+
+    **kwargs
+        Additional keyword arguments. Accepts the following parameters:
+
+        `c` (float, default=0.0):
+            Parameter for correlation structure in X.
+
+        `dim_x` (int, default=4):
+            Dimension of feature vectors.
+
+        `xi` (float, default=0.9):
+            Scale parameter for the propensity score function.
+
+        `n_periods` (int, default=5):
+            Number of time periods.
+
+        `anticipation_periods` (int, default=0):
+            Number of periods before treatment where anticipation effects occur.
+
+        `n_pre_treat_periods` (int, default=2):
+            Number of pre-treatment periods.
+
+        `start_date` (str, default="2025-01"):
+            Start date for datetime time variables.
+
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame containing the simulated repeated cross-sectional data.
+
+    References
+    ----------
+    Callaway, B. and Sant’Anna, P. H.
(2021), + Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230. + doi:`10.1016/j.jeconom.2020.12.001 `_. + """ + + n_obs_panel = int(np.ceil(n_obs / lambda_t)) + df_panel = make_did_CS2021( + n_obs=n_obs_panel, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + **kwargs, + ) + + # for each time period, randomly select units to observe + observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index))) + df_repeated_cs = df_panel[observed_units == 1].copy() + + return df_repeated_cs From 10e532e79600cced091cf471c729269ba7b7b983 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 2 Jun 2025 14:21:04 +0200 Subject: [PATCH 02/84] add simple test cases for cross sectional dgp --- doubleml/did/datasets/__init__.py | 2 ++ doubleml/did/tests/test_datasets.py | 54 ++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/doubleml/did/datasets/__init__.py b/doubleml/did/datasets/__init__.py index aaa5fc0a..306e7b10 100644 --- a/doubleml/did/datasets/__init__.py +++ b/doubleml/did/datasets/__init__.py @@ -3,9 +3,11 @@ """ from .dgp_did_CS2021 import make_did_CS2021 +from .dgp_did_cs_CS2021 import make_did_cs_CS2021 from .dgp_did_SZ2020 import make_did_SZ2020 __all__ = [ "make_did_SZ2020", "make_did_CS2021", + "make_did_cs_CS2021", ] diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py index 0e323ec9..54eb4074 100644 --- a/doubleml/did/tests/test_datasets.py +++ b/doubleml/did/tests/test_datasets.py @@ -3,7 +3,7 @@ import pytest from doubleml import DoubleMLData -from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 msg_inv_return_type = "Invalid return_type." @@ -77,3 +77,55 @@ def test_make_did_CS2021_exceptions(): msg = r"time_type must be one of \('datetime', 'float'\). Got 2." 
with pytest.raises(ValueError, match=msg): _ = make_did_CS2021(n_obs=100, time_type=2) + + +@pytest.fixture(scope="function", params=[0.5, 0.1]) +def lambda_t(request): + return request.param + + +@pytest.mark.ci +def test_make_did_cs_CS2021_return_types(dgp_type, include_never_treated, lambda_t, time_type, anticipation_periods): + np.random.seed(3141) + df = make_did_cs_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + lambda_t=lambda_t, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + assert isinstance(df, pd.DataFrame) + + +@pytest.mark.ci +def test_panel_vs_cs_make_did_CS2021(dgp_type, include_never_treated, time_type, anticipation_periods): + np.random.seed(3141) + df_cs = make_did_cs_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + lambda_t=1.0, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + + np.random.seed(3141) + df_panel = make_did_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + + # check if df_cs close to df_panel + assert df_cs.shape[0] == df_panel.shape[0] + # Select numerical columns + df_cs_numeric = df_cs.select_dtypes(include=np.number) + df_panel_numeric = df_panel.select_dtypes(include=np.number) + + # Ensure the same numerical columns are being compared, in the same order + pd.testing.assert_index_equal(df_cs_numeric.columns, df_panel_numeric.columns) + + assert np.allclose(df_cs_numeric.values, df_panel_numeric.values, atol=1e-5, rtol=1e-5) From c96605d28b6628c7d1bcf32c8fee0e0f8609b171 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Jun 2025 13:28:48 +0200 Subject: [PATCH 03/84] reset index for in panel data --- doubleml/data/panel_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index f548ae6a..4e416183 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -106,6 +106,8 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=False, ) + # reset index to ensure a simple RangeIndex + self.data.reset_index(drop=True, inplace=True) if self.n_treat != 1: raise ValueError("Only one treatment column is allowed for panel data.") From 61dbf11470ca1f97be57196ac8c2b03e83ed94f6 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Jun 2025 13:29:30 +0200 Subject: [PATCH 04/84] add basic did_cs_binary version with simple tests --- doubleml/did/__init__.py | 2 + doubleml/did/did_cs_binary.py | 592 ++++++++++++++++++ ...test_did_cs_binary_external_predictions.py | 92 +++ ...test_did_cs_binary_vs_did_cs_two_period.py | 163 +++++ 4 files changed, 849 insertions(+) create mode 100644 doubleml/did/did_cs_binary.py create mode 100644 doubleml/did/tests/test_did_cs_binary_external_predictions.py create mode 100644 doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py diff --git a/doubleml/did/__init__.py b/doubleml/did/__init__.py index 354ffaa5..369353ef 100644 --- a/doubleml/did/__init__.py +++ b/doubleml/did/__init__.py @@ -6,6 +6,7 @@ from .did_aggregation import DoubleMLDIDAggregation from .did_binary import DoubleMLDIDBinary from .did_cs import DoubleMLDIDCS +from .did_cs_binary import DoubleMLDIDCSBinary from .did_multi import DoubleMLDIDMulti __all__ = [ @@ -13,5 +14,6 @@ "DoubleMLDID", "DoubleMLDIDCS", "DoubleMLDIDBinary", + "DoubleMLDIDCSBinary", "DoubleMLDIDMulti", ] diff --git 
a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py new file mode 100644 index 00000000..ce57384c --- /dev/null +++ b/doubleml/did/did_cs_binary.py @@ -0,0 +1,592 @@ +import warnings + +import numpy as np +from sklearn.utils import check_X_y + +from doubleml.data.panel_data import DoubleMLPanelData +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _get_id_positions, + _get_never_treated_value, + _is_never_treated, + _set_id_positions, +) +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import LinearScoreMixin +from doubleml.utils._checks import ( + _check_bool, + _check_finite_predictions, + _check_is_propensity, + _check_score, + _check_trimming, +) +from doubleml.utils._estimation import _dml_cv_predict, _get_cond_smpls_2d +from doubleml.utils._propensity_score import _trimm + + +class DoubleMLDIDCSBinary(LinearScoreMixin, DoubleML): + + def __init__( + self, + obj_dml_data, + g_value, + t_value_pre, + t_value_eval, + ml_g, + ml_m=None, + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) + + self._check_data(self._dml_data) + g_values = self._dml_data.g_values + t_values = self._dml_data.t_values + + _check_bool(print_periods, "print_periods") + self._print_periods = print_periods + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + _check_gt_combination( + (g_value, t_value_pre, t_value_eval), g_values, t_values, self.never_treated_value, self.anticipation_periods + ) + self._g_value = g_value + self._t_value_pre = t_value_pre + self._t_value_eval = t_value_eval + + # check if post_treatment evaluation + if g_value <= t_value_eval: + post_treatment = True + else: + post_treatment = False + + self._post_treatment = post_treatment + + if self._print_periods: + print( + f"Evaluation of ATT({g_value}, {t_value_eval}), with pre-treatment period {t_value_pre},\n" + + f"post-treatment: {post_treatment}. Control group: {control_group}.\n" + ) + + # Preprocess data + self._data_subset = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + + # Handling id values to match pairwise evaluation & simultaneous inference + if not np.all(np.isin(self.data_subset.index, self._dml_data.data.index)): + raise ValueError("The index values in the data subset are not a subset of the original index values.") + + # Find position of data subset in original data + # These entries should be replaced by nuisance predictions, all others should be set to 0. 
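+        # The stored positions are later used by _set_id_positions() to scatter the
+        # subset-level predictions and score elements back into arrays with one entry per
+        # row of the full data set (score elements padded with 0.0, predictions and
+        # targets with NaN).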
+ self._id_positions = self.data_subset.index + + # Numeric values for positions of the entries in id_panel_data inside id_original + # np.nonzero(np.isin(id_original, id_panel_data)) + self._n_subset = self.data_subset.shape[0] + self._n_obs = self._n_subset # Effective sample size used for resampling + + # Save x and y for later ML estimation + self._x_data = self.data_subset.loc[:, self._dml_data.x_cols].values + self._y_data = self.data_subset.loc[:, self._dml_data.y_col].values + self._g_data = self.data_subset.loc[:, "G_indicator"].values + self._t_data = self.data_subset.loc[:, "t_indicator"].values + + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." + ) + + # set stratication for resampling + self._strata = self.data_subset["G_indicator"] + 2 * self.data_subset["t_indicator"] + if draw_sample_splitting: + self.draw_sample_splitting() + + # check learners + ml_g_is_classifier = self._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": ml_g, "ml_m": ml_m} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict"} + + if "ml_m" in self._learner: + self._predict_method["ml_m"] = "predict_proba" + self._initialize_ml_nuisance_params() + + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + self._sensitivity_implemented = False + self._external_predictions_implemented = True + + @property + def g_value(self): + """ + The value indicating the treatment group (first period with treatment). + """ + return self._g_value + + @property + def t_value_eval(self): + """ + The value indicating the evaluation period. + """ + return self._t_value_eval + + @property + def t_value_pre(self): + """ + The value indicating the pre-treatment period. + """ + return self._t_value_pre + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. + """ + return self._never_treated_value + + @property + def post_treatment(self): + """ + Indicates whether the evaluation period is after the treatment period. + """ + return self._post_treatment + + @property + def control_group(self): + """ + The control group. + """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def data_subset(self): + """ + The preprocessed data subset. 
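+        Contains the rows of the original data belonging to the pre-treatment and evaluation
+        periods for the treatment group (``G_indicator == 1``) and the selected control group
+        (``C_indicator == 1``), together with a ``t_indicator`` column marking the evaluation period.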
+ """ + return self._data_subset + + @property + def id_positions(self): + """ + The positions of the id values in the original data. + """ + return self._id_positions + + @property + def in_sample_normalization(self): + """ + Indicates whether the in sample normalization of weights are used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_obs(self): + """ + The number of observations used for estimation. + """ + return self._n_subset + + def _initialize_ml_nuisance_params(self): + if self.score == "observational": + valid_learner = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] + else: + assert self.score == "experimental" + valid_learner = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "For repeated outcomes the data must be of DoubleMLPanelData type. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are not DiD models with instruments implemented." + ) + + one_treat = obj_dml_data.n_treat == 1 + if not (one_treat): + raise ValueError( + "Incompatible data. " + "To fit an DID model with DML " + "exactly one variable needs to be specified as treatment variable." 
+ ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _preprocess_data(self, g_value, pre_t, eval_t): + data = self._dml_data.data + + t_col = self._dml_data.t_col + id_col = self._dml_data.id_col + g_col = self._dml_data.g_col + + # relevant data subset + data_subset_indicator = data[t_col].isin([pre_t, eval_t]) + data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + + # Construct G (treatment group) indicating treatment period in g + G_indicator = (data_subset[g_col] == g_value).astype(int) + + # Construct C (control group) indicating never treated or not yet treated + never_treated = _is_never_treated(data_subset[g_col], self.never_treated_value).reshape(-1) + if self.control_group == "never_treated": + C_indicator = never_treated.astype(int) + + elif self.control_group == "not_yet_treated": + # adjust max_g_value for anticipation periods + t_values = self._dml_data.t_values + max_g_value = t_values[min(np.where(t_values == eval_t)[0][0] + self.anticipation_periods, len(t_values) - 1)] + # not in G just as a additional check + later_treated = (data_subset[g_col] > max_g_value) & (G_indicator == 0) + not_yet_treated = never_treated | later_treated + C_indicator = not_yet_treated.astype(int) + + if np.sum(C_indicator) == 0: + raise ValueError("No observations in the control group.") + + data_subset = data_subset.assign(C_indicator=C_indicator, G_indicator=G_indicator) + # reduce to relevant subset + data_subset = data_subset[(data_subset["G_indicator"] == 1) | (data_subset["C_indicator"] == 1)] + # check if G and C are disjoint + assert sum(G_indicator & C_indicator) == 0 + + # add time indicator + data_subset = data_subset.assign(t_indicator=data_subset[t_col] == eval_t) + return data_subset + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + + # Here: d is a binary treatment indicator + x, y = check_X_y(X=self._x_data, y=self._y_data, force_all_finite=False) + _, d = check_X_y(x, self._g_data, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data, force_all_finite=False) + + # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold + # nuisance estimates of the uncond. treatment prob. 
+ p_hat = np.full_like(d, d.mean(), dtype="float64") + lambda_hat = np.full_like(t, t.mean(), dtype="float64") + + # nuisance g + smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) + + # nuisance g for d==0 & t==0 + if external_predictions["ml_g_d0_t0"] is not None: + ml_g_d0_t0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_d0_t0_targets[((d == 0) & (t == 0))] = y[((d == 0) & (t == 0))] + ml_d0_t0_pred = _get_id_positions(external_predictions["ml_g_d0_t0"], self.id_positions) + g_hat_d0_t0 = {"preds": ml_d0_t0_pred, "targets": ml_g_d0_t0_targets, "models": None} + else: + g_hat_d0_t0 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_d0_t0, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g_d0_t0"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat_d0_t0["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat_d0_t0["targets"] = g_hat_d0_t0["targets"].astype(float) + g_hat_d0_t0["targets"][np.invert((d == 0) & (t == 0))] = np.nan + + # nuisance g for d==0 & t==1 + if external_predictions["ml_g_d0_t1"] is not None: + ml_g_d0_t1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_d0_t1_targets[((d == 0) & (t == 1))] = y[((d == 0) & (t == 1))] + ml_d0_t1_pred = _get_id_positions(external_predictions["ml_g_d0_t1"], self.id_positions) + g_hat_d0_t1 = {"preds": ml_d0_t1_pred, "targets": ml_g_d0_t1_targets, "models": None} + else: + g_hat_d0_t1 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_d0_t1, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g_d0_t1"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat_d0_t1["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat_d0_t1["targets"] = g_hat_d0_t1["targets"].astype(float) + g_hat_d0_t1["targets"][np.invert((d == 0) & (t == 1))] = np.nan + + # nuisance g for d==1 & t==0 + if external_predictions["ml_g_d1_t0"] is not None: + ml_g_d1_t0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_d1_t0_targets[((d == 1) & (t == 0))] = y[((d == 1) & (t == 0))] + ml_d1_t0_pred = _get_id_positions(external_predictions["ml_g_d1_t0"], self.id_positions) + g_hat_d1_t0 = {"preds": ml_d1_t0_pred, "targets": ml_g_d1_t0_targets, "models": None} + else: + g_hat_d1_t0 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_d1_t0, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g_d1_t0"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat_d1_t0["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat_d1_t0["targets"] = g_hat_d1_t0["targets"].astype(float) + g_hat_d1_t0["targets"][np.invert((d == 1) & (t == 0))] = np.nan + + # nuisance g for d==1 & t==1 + if external_predictions["ml_g_d1_t1"] is not None: + ml_g_d1_t1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_d1_t1_targets[((d == 1) & (t == 1))] = y[((d == 1) & (t == 1))] + ml_d1_t1_pred = _get_id_positions(external_predictions["ml_g_d1_t1"], self.id_positions) + g_hat_d1_t1 = {"preds": ml_d1_t1_pred, "targets": ml_g_d1_t1_targets, "models": None} + else: + g_hat_d1_t1 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_d1_t1, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g_d1_t1"), + method=self._predict_method["ml_g"], + 
return_models=return_models, + ) + + _check_finite_predictions(g_hat_d1_t1["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat_d1_t1["targets"] = g_hat_d1_t1["targets"].astype(float) + g_hat_d1_t1["targets"][np.invert((d == 1) & (t == 1))] = np.nan + + # only relevant for observational setting + m_hat = {"preds": None, "targets": None, "models": None} + if self.score == "observational": + # nuisance m + if external_predictions["ml_m"] is not None: + ml_m_pred = _get_id_positions(external_predictions["ml_m"], self.id_positions) + m_hat = {"preds": ml_m_pred, "targets": d, "models": None} + else: + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) + + psi_a, psi_b = self._score_elements( + y, + d, + t, + g_hat_d0_t0["preds"], + g_hat_d0_t1["preds"], + g_hat_d1_t0["preds"], + g_hat_d1_t1["preds"], + m_hat["preds"], + p_hat, + lambda_hat, + ) + + extend_kwargs = { + "n_obs": self._dml_data.data.shape[0], + "id_positions": self.id_positions, + } + psi_elements = { + "psi_a": _set_id_positions(psi_a, fill_value=0.0, **extend_kwargs), + "psi_b": _set_id_positions(psi_b, fill_value=0.0, **extend_kwargs), + } + preds = { + "predictions": { + "ml_g_d0_t0": _set_id_positions(g_hat_d0_t0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d0_t1": _set_id_positions(g_hat_d0_t1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t0": _set_id_positions(g_hat_d1_t0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t1": _set_id_positions(g_hat_d1_t1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["preds"], fill_value=np.nan, **extend_kwargs), + }, + "targets": { + "ml_g_d0_t0": _set_id_positions(g_hat_d0_t0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d0_t1": _set_id_positions(g_hat_d0_t1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t0": _set_id_positions(g_hat_d1_t0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t1": _set_id_positions(g_hat_d1_t1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["targets"], fill_value=np.nan, **extend_kwargs), + }, + "models": { + "ml_g_d0_t0": g_hat_d0_t0["models"], + "ml_g_d0_t1": g_hat_d0_t1["models"], + "ml_g_d1_t0": g_hat_d1_t0["models"], + "ml_g_d1_t1": g_hat_d1_t1["models"], + "ml_m": m_hat["models"], + }, + } + + return psi_elements, preds + + def _score_elements(self, y, d, t, g_hat_d0_t0, g_hat_d0_t1, g_hat_d1_t0, g_hat_d1_t1, m_hat, p_hat, lambda_hat): + # calculate residuals + resid_d0_t0 = y - g_hat_d0_t0 + resid_d0_t1 = y - g_hat_d0_t1 + resid_d1_t0 = y - g_hat_d1_t0 + resid_d1_t1 = y - g_hat_d1_t1 + + d1t1 = np.multiply(d, t) + d1t0 = np.multiply(d, 1.0 - t) + d0t1 = np.multiply(1.0 - d, t) + d0t0 = np.multiply(1.0 - d, 1.0 - t) + + if self.score == "observational": + if self.in_sample_normalization: + weight_psi_a = np.divide(d, np.mean(d)) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.mean(d1t1)) + weight_resid_d1_t0 = 
-1.0 * np.divide(d1t0, np.mean(d1t0)) + + prop_weighting = np.divide(m_hat, 1.0 - m_hat) + unscaled_d0_t1 = np.multiply(d0t1, prop_weighting) + weight_resid_d0_t1 = -1.0 * np.divide(unscaled_d0_t1, np.mean(unscaled_d0_t1)) + + unscaled_d0_t0 = np.multiply(d0t0, prop_weighting) + weight_resid_d0_t0 = np.divide(unscaled_d0_t0, np.mean(unscaled_d0_t0)) + else: + weight_psi_a = np.divide(d, p_hat) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + + prop_weighting = np.divide(m_hat, 1.0 - m_hat) + weight_resid_d0_t1 = -1.0 * np.multiply(np.divide(d0t1, np.multiply(p_hat, lambda_hat)), prop_weighting) + weight_resid_d0_t0 = np.multiply(np.divide(d0t0, np.multiply(p_hat, 1.0 - lambda_hat)), prop_weighting) + else: + assert self.score == "experimental" + if self.in_sample_normalization: + weight_psi_a = np.ones_like(y) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.mean(d1t1)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.mean(d1t0)) + weight_resid_d0_t1 = -1.0 * np.divide(d0t1, np.mean(d0t1)) + weight_resid_d0_t0 = np.divide(d0t0, np.mean(d0t0)) + else: + weight_psi_a = np.ones_like(y) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + weight_resid_d0_t1 = -1.0 * np.divide(d0t1, np.multiply(1.0 - p_hat, lambda_hat)) + weight_resid_d0_t0 = np.divide(d0t0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + + # set score elements + psi_a = -1.0 * weight_psi_a + + # psi_b + psi_b_1 = ( + np.multiply(weight_g_d1_t1, g_hat_d1_t1) + + np.multiply(weight_g_d1_t0, g_hat_d1_t0) + + np.multiply(weight_g_d0_t0, g_hat_d0_t0) + + np.multiply(weight_g_d0_t1, g_hat_d0_t1) + ) + psi_b_2 = ( + np.multiply(weight_resid_d1_t1, resid_d1_t1) + + np.multiply(weight_resid_d1_t0, resid_d1_t0) + + np.multiply(weight_resid_d0_t0, resid_d0_t0) + + np.multiply(weight_resid_d0_t1, resid_d0_t1) + ) + + psi_b = psi_b_1 + psi_b_2 + + return psi_a, psi_b + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + pass + + def _sensitivity_element_est(self, preds): + pass diff --git a/doubleml/did/tests/test_did_cs_binary_external_predictions.py b/doubleml/did/tests/test_did_cs_binary_external_predictions.py new file mode 100644 index 00000000..4e09dfe0 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_external_predictions.py @@ -0,0 +1,92 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.did import DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_SZ2020 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + 
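+# The fixture below fits DoubleMLDIDCSBinary with actual learners, stores the cross-fitted
+# nuisance predictions, and then refits the model with dummy learners while supplying those
+# predictions through `external_predictions`. The subsequent tests check that coefficients,
+# standard errors, scores and nuisance losses agree between the two fits.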
+@pytest.fixture(scope="module") +def doubleml_did_cs_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + + ext_predictions = {"d": {}} + dml_data = make_did_SZ2020(n_obs=n_obs, return_type="DoubleMLPanelData") + + kwargs = { + "obj_dml_data": dml_data, + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDCSBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + strata = dml_did.data_subset["G_indicator"] + 2 * dml_did.data_subset["t_indicator"] + all_smpls = draw_smpls(2 * n_obs, n_folds, n_rep=n_rep, groups=strata) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + + dml_did_ext = DoubleMLDIDCSBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_cs_fixture): + assert math.isclose(doubleml_did_cs_fixture["coef"], doubleml_did_cs_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_cs_fixture): + assert math.isclose(doubleml_did_cs_fixture["se"], doubleml_did_cs_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_score(doubleml_did_cs_fixture): + assert np.allclose(doubleml_did_cs_fixture["score"], doubleml_did_cs_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_cs_fixture): + for key, value in doubleml_did_cs_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_cs_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py new file mode 100644 index 00000000..2c8c34f3 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -0,0 +1,163 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_cs_manual import fit_did_cs +from ._utils_did_manual import boot_did + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def 
in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score, in_sample_normalization, trimming_threshold): + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + + n_obs = df.shape[0] + all_smpls = draw_smpls(n_obs, n_folds) + obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDIDCS( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df["y"].values + d = df["d"].values + x = df[["Z1", "Z2", "Z3", "Z4"]].values + t = df["t"].values + + np.random.seed(3141) + res_manual = fit_did_cs( + y, + x, + d, + t, + clone(learner[0]), + clone(learner[1]), + all_smpls, + score, + in_sample_normalization, + trimming_threshold=trimming_threshold, + ) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_cs_binary_vs_did_cs_fixture): + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["coef"][0], + dml_did_cs_binary_vs_did_cs_fixture["coef_manual"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["coef_binary"][0], + dml_did_cs_binary_vs_did_cs_fixture["coef"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) From ceebc6ee5d462016f8ddaab3e8d0c2f9325665be Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Jun 2025 14:19:28 +0200 Subject: [PATCH 05/84] add internal atribute _score_dim to DoubleML class --- doubleml/double_ml.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/doubleml/double_ml.py 
b/doubleml/double_ml.py index 764865a4..0ab80cfa 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -101,6 +101,7 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): if draw_sample_splitting: self.draw_sample_splitting() + self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) # initialize arrays according to obj_dml_data and the resampling settings ( self._psi, @@ -1021,9 +1022,7 @@ def _initalize_fit(self, store_predictions, store_models): self._initialize_models() if self._sensitivity_implemented: - self._sensitivity_elements = self._initialize_sensitivity_elements( - (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) - ) + self._sensitivity_elements = self._initialize_sensitivity_elements(self._score_dim) def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models): ext_prediction_dict = _set_external_predictions( @@ -1076,30 +1075,26 @@ def _fit_sensitivity_elements(self, nuisance_predictions): def _initialize_arrays(self): # scores - psi = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - psi_deriv = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - psi_elements = self._initialize_score_elements((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs)) + psi = np.full(self._score_dim, np.nan) + psi_deriv = np.full(self._score_dim, np.nan) + psi_elements = self._initialize_score_elements(self._score_dim) - var_scaling_factors = np.full(self._dml_data.n_treat, np.nan) + n_rep = self._score_dim[1] + n_thetas = self._score_dim[2] + var_scaling_factors = np.full(n_thetas, np.nan) # coefficients and ses - coef = np.full(self._dml_data.n_coefs, np.nan) - se = np.full(self._dml_data.n_coefs, np.nan) + coef = np.full(n_thetas, np.nan) + se = np.full(n_thetas, np.nan) - all_coef = np.full((self._dml_data.n_coefs, self.n_rep), np.nan) - all_se = np.full((self._dml_data.n_coefs, self.n_rep), np.nan) + all_coef = np.full((n_thetas, n_rep), np.nan) + all_se = np.full((n_thetas, n_rep), np.nan) return psi, psi_deriv, psi_elements, var_scaling_factors, coef, se, all_coef, all_se def _initialize_predictions_and_targets(self): - self._predictions = { - learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - for learner in self.params_names - } - self._nuisance_targets = { - learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - for learner in self.params_names - } + self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} + self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} def _initialize_nuisance_loss(self): self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names} From ade3b9a451bb0cb20367661773d9a00eb3f9968e Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Jun 2025 14:25:12 +0200 Subject: [PATCH 06/84] check prediction size based on internal n_obs --- doubleml/double_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 0ab80cfa..911487a3 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1005,7 +1005,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m external_predictions=external_predictions, valid_treatments=self._dml_data.d_cols, valid_learners=self.params_names, - 
n_obs=self._dml_data.n_obs, + n_obs=self.n_obs, n_rep=self.n_rep, ) elif not self._external_predictions_implemented and external_predictions is not None: From f113e61e1375f807c039f20b94f777d73e0c6504 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Jun 2025 14:26:28 +0200 Subject: [PATCH 07/84] update score dimensions init in the cs object --- doubleml/did/did_cs_binary.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index ce57384c..e550eb60 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -50,6 +50,20 @@ def __init__( ): super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) + self._n_obs = obj_dml_data.data.shape[0] + self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat) + # reinitialze arrays + ( + self._psi, + self._psi_deriv, + self._psi_elements, + self._var_scaling_factors, + self._coef, + self._se, + self._all_coef, + self._all_se, + ) = self._initialize_arrays() + self._check_data(self._dml_data) g_values = self._dml_data.g_values t_values = self._dml_data.t_values From d65edf8b861dabaf5c4c1b4468303231b781fcc0 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:35:41 +0200 Subject: [PATCH 08/84] Refactor Data Generators #306 --- doubleml/datasets.py | 1620 ----------------- doubleml/datasets/__init__.py | 13 + doubleml/datasets/fetch_401K.py | 65 + doubleml/datasets/fetch_bonus.py | 98 + doubleml/irm/datasets/__init__.py | 20 + .../irm/datasets/dgp_confounded_irm_data.py | 232 +++ .../irm/datasets/dgp_heterogeneous_data.py | 114 ++ doubleml/irm/datasets/dgp_iivm_data.py | 102 ++ doubleml/irm/datasets/dgp_irm_data.py | 103 ++ .../dgp_irm_data_discrete_treatments.py | 164 ++ doubleml/irm/datasets/dgp_ssm_data.py | 102 ++ doubleml/plm/datasets/__init__.py | 20 + doubleml/plm/datasets/_make_pliv_data.py | 70 + .../plm/datasets/dgp_confounded_plr_data.py | 171 ++ doubleml/plm/datasets/dgp_pliv_CHS2015.py | 108 ++ .../dgp_pliv_multiway_cluster_CKMS2021.py | 199 ++ doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py | 108 ++ doubleml/plm/datasets/dgp_plr_turrell2018.py | 107 ++ 18 files changed, 1796 insertions(+), 1620 deletions(-) delete mode 100644 doubleml/datasets.py create mode 100644 doubleml/datasets/__init__.py create mode 100644 doubleml/datasets/fetch_401K.py create mode 100644 doubleml/datasets/fetch_bonus.py create mode 100644 doubleml/irm/datasets/__init__.py create mode 100644 doubleml/irm/datasets/dgp_confounded_irm_data.py create mode 100644 doubleml/irm/datasets/dgp_heterogeneous_data.py create mode 100644 doubleml/irm/datasets/dgp_iivm_data.py create mode 100644 doubleml/irm/datasets/dgp_irm_data.py create mode 100644 doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py create mode 100644 doubleml/irm/datasets/dgp_ssm_data.py create mode 100644 doubleml/plm/datasets/__init__.py create mode 100644 doubleml/plm/datasets/_make_pliv_data.py create mode 100644 doubleml/plm/datasets/dgp_confounded_plr_data.py create mode 100644 doubleml/plm/datasets/dgp_pliv_CHS2015.py create mode 100644 doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py create mode 100644 doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py create mode 100644 doubleml/plm/datasets/dgp_plr_turrell2018.py diff --git a/doubleml/datasets.py b/doubleml/datasets.py deleted file mode 100644 index 0dcd33c7..00000000 --- a/doubleml/datasets.py +++ /dev/null @@ -1,1620 +0,0 @@ -import warnings - -import numpy as np -import pandas as 
pd -from scipy.linalg import toeplitz -from scipy.optimize import minimize_scalar -from sklearn.datasets import make_spd_matrix -from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures - -from doubleml.data import DoubleMLClusterData, DoubleMLData -from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_cluster_data_alias, _get_dml_data_alias - -_array_alias = _get_array_alias() -_data_frame_alias = _get_data_frame_alias() -_dml_data_alias = _get_dml_data_alias() -_dml_cluster_data_alias = _get_dml_cluster_data_alias() - - -def fetch_401K(return_type="DoubleMLData", polynomial_features=False): - """ - Data set on financial wealth and 401(k) plan participation. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of - Econometrics, 113(2): 231-263. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta" - raw_data = pd.read_stata(url) - - y_col = "net_tfa" - d_cols = ["e401"] - x_cols = ["age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown"] - - data = raw_data.copy() - - if polynomial_features: - raise NotImplementedError("polynomial_features os not implemented yet for fetch_401K.") - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError("Invalid return_type.") - - -def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): - """ - Data set on the Pennsylvania Reemployment Bonus experiment. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. - Journal of Applied Econometrics, 15(6): 575-594. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. 
- """ - url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat" - raw_data = pd.read_csv(url, sep=r"\s+") - - ind = (raw_data["tg"] == 0) | (raw_data["tg"] == 4) - data = raw_data.copy()[ind] - data.reset_index(inplace=True) - data["tg"] = data["tg"].replace(4, 1) - data["inuidur1"] = np.log(data["inuidur1"]) - - # variable dep as factor (dummy encoding) - dummy_enc = OneHotEncoder(drop="first", categories="auto").fit(data.loc[:, ["dep"]]) - xx = dummy_enc.transform(data.loc[:, ["dep"]]).toarray() - data["dep1"] = xx[:, 0] - data["dep2"] = xx[:, 1] - - y_col = "inuidur1" - d_cols = ["tg"] - x_cols = [ - "female", - "black", - "othrace", - "dep1", - "dep2", - "q2", - "q3", - "q4", - "q5", - "q6", - "agelt35", - "agegt54", - "durable", - "lusd", - "husd", - ] - - if polynomial_features: - poly = PolynomialFeatures(2, include_bias=False) - data_transf = poly.fit_transform(data[x_cols]) - x_cols = list(poly.get_feature_names_out(x_cols)) - - data_transf = pd.DataFrame(data_transf, columns=x_cols) - data = pd.concat((data[[y_col] + d_cols], data_transf), axis=1, sort=False) - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError("Invalid return_type.") - - -def _g(x): - return np.power(np.sin(x), 2) - - -def _m(x, nu=0.0, gamma=1.0): - return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) - - -def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", **kwargs): - """ - Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. - The nuisance functions are given by - - .. math:: - - m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, - - g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - alpha : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. - - References - ---------- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. 
- """ - a_0 = kwargs.get("a_0", 1.0) - a_1 = kwargs.get("a_1", 0.25) - s_1 = kwargs.get("s_1", 1.0) - - b_0 = kwargs.get("b_0", 1.0) - b_1 = kwargs.get("b_1", 0.25) - s_2 = kwargs.get("s_2", 1.0) - - cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - d = ( - a_0 * x[:, 0] - + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) - + s_1 - * np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - y = ( - alpha * d - + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) - + b_1 * x[:, 2] - + s_2 - * np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type="DoubleMLData", **kwargs): - """ - Generates data from a partially linear regression model used in a blog article by Turrell (2018). - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, - positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. - :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by - - .. math:: - - m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, - - g_0(x_i) &= \\sin(x_i)^2. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\nu=0`, or :math:`\\gamma=1`. - - References - ---------- - Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, - science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ - `_. 
- """ - nu = kwargs.get("nu", 0.0) - gamma = kwargs.get("gamma", 1.0) - - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - G = _g(np.dot(x, b)) - M = _m(np.dot(x, b), nu=nu, gamma=gamma) - d = M + np.random.standard_normal( - size=[ - n_obs, - ] - ) - y = ( - np.dot(theta, d) - + G - + np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type="DoubleMLData"): - """ - Generates data from a interactive regression (IRM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i - \\sim \\mathcal{U}(0,1), - - y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and - :math:`c_d` are given by - - .. math:: - - c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = - \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. - - The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni - et al. (2017). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - R2_d : - The value of the parameter :math:`R_d^2`. - R2_y : - The value of the parameter :math:`R_y^2`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - - References - ---------- - Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With - High‐Dimensional Data. Econometrica, 85: 233-298. 
- """ - # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement - v = np.random.uniform( - size=[ - n_obs, - ] - ) - zeta = np.random.standard_normal( - size=[ - n_obs, - ] - ) - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) - c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) - c_d = np.sqrt(np.pi**2 / 3.0 * R2_d / ((1 - R2_d) * b_sigma_b)) - - xx = np.exp(np.dot(x, np.multiply(beta, c_d))) - d = 1.0 * ((xx / (1 + xx)) > v) - - y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2, return_type="DoubleMLData"): - """ - Generates data from a interactive IV regression (IIVM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, - - y_i &= \\theta d_i + x_i' \\beta + u_i, - - with :math:`Z \\sim \\text{Bernoulli}(0.5)` and - - .. math:: - - \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). - - The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries - :math:`\\beta_j=\\frac{1}{j^2}`. - - The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and - Klaassen (2020). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - alpha_x : - The value of the parameter :math:`\\alpha_x`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion - Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. 
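A usage sketch for ``make_iivm_data`` (again assuming the pre-split import path); with the array return type the instrument ``z`` is returned alongside ``(x, y, d)``::

    import numpy as np
    from doubleml.datasets import make_iivm_data

    np.random.seed(3141)
    x, y, d, z = make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2, return_type="array")
    print(x.shape, d.mean(), z.mean())  # treated share and instrument share
    # as DoubleMLData, the instrument is registered as the column "z"
    dml_data = make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2)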
- """ - # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 - xx = np.random.multivariate_normal( - np.zeros(2), - np.array([[1.0, 0.3], [0.3, 1.0]]), - size=[ - n_obs, - ], - ) - u = xx[:, 0] - v = xx[:, 1] - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - - z = np.random.binomial( - p=0.5, - n=1, - size=[ - n_obs, - ], - ) - d = 1.0 * (alpha_x * z + v > 0) - - y = d * theta + np.dot(x, beta) + u - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, "z") - else: - raise ValueError("Invalid return_type.") - - -def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type="DoubleMLData"): - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - G = _g(np.dot(x, b)) - # instrument - z = _m(np.dot(x, b)) + np.random.standard_normal( - size=[ - n_obs, - ] - ) - # treatment - M = _m(gamma_z * z + np.dot(x, b)) - d = M + np.random.standard_normal( - size=[ - n_obs, - ] - ) - y = ( - np.dot(theta, d) - + G - + np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, "z") - else: - raise ValueError("Invalid return_type.") - - -def make_pliv_CHS2015(n_obs, alpha=1.0, dim_x=200, dim_z=150, return_type="DoubleMLData"): - """ - Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). - The data generating process is defined as - - .. math:: - - z_i &= \\Pi x_i + \\zeta_i, - - d_i &= x_i' \\gamma + z_i' \\delta + u_i, - - y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, - - with - - .. math:: - - \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ - 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) - - where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. - :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, - :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` - and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - alpha : - The value of the causal parameter. - dim_x : - The number of covariates. - dim_z : - The number of instruments. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear - Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. - """ - assert dim_x >= dim_z - # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf - xx = np.random.multivariate_normal( - np.zeros(2), - np.array([[1.0, 0.6], [0.6, 1.0]]), - size=[ - n_obs, - ], - ) - epsilon = xx[:, 0] - u = xx[:, 1] - - sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - - I_z = np.eye(dim_z) - xi = np.random.multivariate_normal( - np.zeros(dim_z), - 0.25 * I_z, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - gamma = beta - delta = [1 / (k**2) for k in range(1, dim_z + 1)] - Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) - - z = np.dot(x, np.transpose(Pi)) + xi - d = np.dot(x, gamma) + np.dot(z, delta) + u - y = alpha * d + np.dot(x, beta) + epsilon - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - z_cols = [f"Z{i + 1}" for i in np.arange(dim_z)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d"] + z_cols) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, z_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs): - """ - Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. - (2021). The data generating process is defined as - - .. math:: - - Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, - - D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, - - Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, - - with - - .. math:: - - X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X - + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, - - \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon - + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, - - v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v - + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, - - V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V - + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, - - and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` - where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries - :math:`\\Sigma_{kj} = s_X^{|j-k|}`. - Further - - .. 
math:: - - \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) - \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ - s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) - - - and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. - - Parameters - ---------- - N : - The number of observations (first dimension). - M : - The number of observations (second dimension). - dim_X : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where - ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s - ``(x, y, d, cluster_vars, z)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, - :math:`s_X = s_{\\varepsilon v} = 0.25`, - or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries - :math:`(\\zeta_{0})_j = 0.5^j`. - - References - ---------- - Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, - Journal of Business & Economic Statistics, - doi: `10.1080/07350015.2021.1895815 `_, - arXiv:`1909.03489 `_. - """ - # additional parameters specifiable via kwargs - pi_10 = kwargs.get("pi_10", 1.0) - - xx = np.arange(1, dim_X + 1) - zeta_0 = kwargs.get("zeta_0", np.power(0.5, xx)) - pi_20 = kwargs.get("pi_20", np.power(0.5, xx)) - xi_0 = kwargs.get("xi_0", np.power(0.5, xx)) - - omega_X = kwargs.get("omega_X", np.array([0.25, 0.25])) - omega_epsilon = kwargs.get("omega_epsilon", np.array([0.25, 0.25])) - omega_v = kwargs.get("omega_v", np.array([0.25, 0.25])) - omega_V = kwargs.get("omega_V", np.array([0.25, 0.25])) - - s_X = kwargs.get("s_X", 0.25) - s_epsilon_v = kwargs.get("s_epsilon_v", 0.25) - - # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., - # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] - # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] - - alpha_V = np.random.normal(size=(N * M)) - alpha_V_i = np.repeat(np.random.normal(size=N), M) - alpha_V_j = np.tile(np.random.normal(size=M), N) - - cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) - alpha_eps_v = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - N * M, - ], - ) - alpha_eps = alpha_eps_v[:, 0] - alpha_v = alpha_eps_v[:, 1] - - alpha_eps_v_i = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - N, - ], - ) - alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) - alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) - - alpha_eps_v_j = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - M, - ], - ) - alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) - alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) - - cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) - alpha_X = np.random.multivariate_normal( - np.zeros(dim_X), - cov_mat, - size=[ - N * M, - ], - ) - alpha_X_i = np.repeat( - np.random.multivariate_normal( - np.zeros(dim_X), - 
cov_mat, - size=[ - N, - ], - ), - M, - axis=0, - ) - alpha_X_j = np.tile( - np.random.multivariate_normal( - np.zeros(dim_X), - cov_mat, - size=[ - M, - ], - ), - (N, 1), - ) - - # generate variables - x = (1 - omega_X[0] - omega_X[1]) * alpha_X + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j - - eps = ( - (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j - ) - - v = (1 - omega_v[0] - omega_v[1]) * alpha_v + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j - - V = (1 - omega_V[0] - omega_V[1]) * alpha_V + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j - - z = np.matmul(x, xi_0) + V - d = z * pi_10 + np.matmul(x, pi_20) + v - y = d * theta + np.matmul(x, zeta_0) + eps - - cluster_cols = ["cluster_var_i", "cluster_var_j"] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) - - if return_type in _array_alias: - return x, y, d, cluster_vars.values, z - elif return_type in _data_frame_alias + _dml_cluster_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] - data = pd.concat((cluster_vars, pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["Y", "D", "Z"])), axis=1) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") - else: - raise ValueError("Invalid return_type.") - - -def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): - """ - Generates counfounded data from an interactive regression model. - - The data generating process is defined as follows (inspired by the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the propensity score as - - .. math:: - - m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A - - where - - .. math:: - - p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, - - f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). - - and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. - Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as - - .. math:: - - P(D=1|X) = p(Z). - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) - \\cdot D (Z_5 + 1) + g(Z). 
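The short-form slope above can be checked empirically from the generator's oracle values. A sketch, assuming ``make_confounded_irm_data`` is importable from ``doubleml.datasets`` (pre-split layout)::

    import numpy as np
    from doubleml.datasets import make_confounded_irm_data

    np.random.seed(42)
    res = make_confounded_irm_data(n_obs=100_000, theta=0.0, gamma_a=0.127, beta_a=0.58)
    a = res["oracle_values"]["a"]
    z5 = res["oracle_values"]["z"][:, 4]
    d_z5 = res["d"] * (z5 + 1)
    # implied slope of the short model: theta + beta_a * Cov(A, D(Z_5 + 1)) / Var(D(Z_5 + 1))
    theta_short = 0.0 + 0.58 * np.cov(a, d_z5)[0, 1] / np.var(d_z5)
    print(theta_short)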
- - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be - set via the parameters ``gamma_a`` and ``beta_a``. - - The observed data is given as :math:`W = (Y, D, Z)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, - the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and - in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) - are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``0.0``. - gamma_a : float - Coefficient of the unobserved confounder in the propensity score. - Default is ``0.127``. - beta_a : float - Coefficient of the unobserved confounder in the outcome regression. - Default is ``0.58``. - linear : bool - If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. - """ - c = 0.0 # the confounding strength is only valid for c=0 - xi = 0.75 - dim_x = kwargs.get("dim_x", 5) - trimming_threshold = kwargs.get("trimming_threshold", 0.01) - var_eps_y = kwargs.get("var_eps_y", 1.0) - - # Specification of main regression function - def f_reg(w): - res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) - return res - - # Specification of prop score function - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - z_tilde_5 = x[:, 4] - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - # error terms and unobserved confounder - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # Choose the features used in the models - if linear: - features_ps = x - features_reg = x - else: - features_ps = z - features_reg = z - - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - # compute short and long form of propensity score - m_long = p + gamma_a * a - m_short = p - # check propensity score bounds - if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): - m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) - m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) - warnings.warn( - f"Propensity score is close to 0 or 1. 
" - f"Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied" - ) - # generate treatment based on long form - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (m_long >= u) - # add treatment heterogeneity - d1x = z[:, 4] + 1 - var_dx = np.var(d * (d1x)) - cov_adx = gamma_a * var_a - # Outcome regression - g_partial_reg = f_reg(features_reg) - # short model - g_short_d0 = g_partial_reg - g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg - g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 - # long model - g_long_d0 = g_partial_reg + beta_a * a - g_long_d1 = theta * d1x + g_partial_reg + beta_a * a - g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 - # Potential outcomes - y_0 = g_long_d0 + eps_y - y_1 = g_long_d1 + eps_y - # Realized outcome - y = d * y_1 + (1.0 - d) * y_0 - # In-sample values for confounding strength - explained_residual_variance = np.square(g_long - g_short) - residual_variance = np.square(y - g_short) - cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) - # compute the Riesz representation - treated_weight = d / np.mean(d) - untreated_weight = (1.0 - d) / np.mean(d) - # Odds ratios - propensity_ratio_long = m_long / (1.0 - m_long) - rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) - rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) - propensity_ratio_short = m_short / (1.0 - m_short) - rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) - rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) - cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( - 1 / (m_long * (1 - m_long)) - ) - cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) - if (beta_a == 0) | (gamma_a == 0): - rho_ate = 0.0 - rho_atte = 0.0 - else: - rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] - rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] - oracle_values = { - "g_long": g_long, - "g_short": g_short, - "m_long": m_long, - "m_short": m_short, - "gamma_a": gamma_a, - "beta_a": beta_a, - "a": a, - "y_0": y_0, - "y_1": y_1, - "z": z, - "cf_y": cf_y, - "cf_d_ate": cf_d_ate, - "cf_d_atte": cf_d_atte, - "rho_ate": rho_ate, - "rho_atte": rho_atte, - } - res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - return res_dict - - -def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): - """ - Generates counfounded data from an partially linear regression model. - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, - where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the treatment as - - .. 
math:: - - D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D - - and with :math:`\\varepsilon \\sim \\mathcal{N}(0,1)`. - Since :math:`A` is independent of :math:`X`, the long and short form of the treatment regression are given as - - .. math:: - - E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A - - E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. - Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). - - The observed data is given as :math:`W = (Y, D, X)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, the effect :math:`\\theta`, - the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and - the propensity score are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``5.0``. - cf_y : float - Percentage of the residual variation of the outcome explained by latent/confounding variable. - Default is ``0.04``. - cf_d : float - Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable. - Default is ``0.04``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
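A usage sketch for ``make_confounded_plr_data`` (assuming the pre-split import path); the requested strengths ``cf_y`` and ``cf_d`` are calibrated internally and the resulting coefficients are reported in ``oracle_values``::

    import numpy as np
    from doubleml.datasets import make_confounded_plr_data

    np.random.seed(42)
    res = make_confounded_plr_data(n_obs=10_000, theta=5.0, cf_y=0.04, cf_d=0.04)
    x, y, d = res["x"], res["y"], res["d"]
    oracle = res["oracle_values"]
    # calibrated confounder coefficients gamma_a and beta_a
    print(oracle["gamma_a"], oracle["beta_a"])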
- """ - c = kwargs.get("c", 0.0) - dim_x = kwargs.get("dim_x", 4) - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # get the required impact of the confounder on the propensity score - m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] - - def f_m(gamma_a): - rr_long = eps_d / var_eps_d - rr_short = (gamma_a * a + eps_d) / (gamma_a**2 * var_a + var_eps_d) - C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) - return np.square(C2_D / (1 + C2_D) - cf_d) - - gamma_a = minimize_scalar(f_m).x - m_long = m_short + gamma_a * a - d = m_long + eps_d - - # short and long version of g - g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) - - var_d = np.var(d) - - def f_g(beta_a): - g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) - y_diff = eps_y + g_diff - return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) - - beta_a = minimize_scalar(f_g).x - - g_long = theta * d + g_partial_reg + beta_a * a - g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg - - y = g_long + eps_y - - oracle_values = { - "g_long": g_long, - "g_short": g_short, - "m_long": m_long, - "m_short": m_short, - "theta": theta, - "gamma_a": gamma_a, - "beta_a": beta_a, - "a": a, - "z": z, - } - - res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - - return res_dict - - -def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): - """ - Creates a simple synthetic example for heterogeneous treatment effects. - The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). - - The data is generated as - - .. math:: - - Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i - - D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, - - where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i - \\sim\\mathcal{U}[-1,1]`. - If the treatment is set to be binary, the treatment is generated as - - .. math:: - D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. - - The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support - which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. - Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending - on the dimension of :math:`x`. - - If the heterogeneity is univariate the conditional treatment effect takes the following form - - .. math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), - - whereas for the two-dimensional case the conditional treatment effect is defined as - - .. 
math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - Default is ``200``. - - p : int - Dimension of covariates. - Default is ``30``. - - support_size : int - Number of relevant (confounding) covariates. - Default is ``5``. - - n_x : int - Dimension of the heterogeneity. Can be either ``1`` or ``2``. - Default is ``1``. - - binary_treatment : bool - Indicates whether the treatment is binary. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``data``, ``effects``, ``treatment_effect``. - - """ - # simple input checks - assert n_x in [1, 2], "n_x must be either 1 or 2." - assert support_size <= p, "support_size must be smaller than p." - assert isinstance(binary_treatment, bool), "binary_treatment must be a boolean." - - # define treatment effects - if n_x == 1: - - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) - - else: - assert n_x == 2 - - # redefine treatment effect - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) - - # Outcome support and coefficients - support_y = np.random.choice(np.arange(p), size=support_size, replace=False) - coefs_y = np.random.uniform(0, 1, size=support_size) - # treatment support and coefficients - support_d = support_y - coefs_d = np.random.uniform(0, 0.3, size=support_size) - - # noise - epsilon = np.random.uniform(-1, 1, size=n_obs) - eta = np.random.uniform(-1, 1, size=n_obs) - - # Generate controls, covariates, treatments and outcomes - x = np.random.uniform(0, 1, size=(n_obs, p)) - # Heterogeneous treatment effects - te = treatment_effect(x) - if binary_treatment: - d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) - else: - d = np.dot(x[:, support_d], coefs_d) + eta - y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon - - # Now we build the dataset - y_df = pd.DataFrame({"y": y}) - d_df = pd.DataFrame({"d": d}) - x_df = pd.DataFrame(data=x, index=np.arange(x.shape[0]), columns=[f"X_{i}" for i in range(x.shape[1])]) - - data = pd.concat([y_df, d_df, x_df], axis=1) - res_dict = {"data": data, "effects": te, "treatment_effect": treatment_effect} - return res_dict - - -def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLData"): - """ - Generates data from a sample selection model (SSM). - The data generating process is defined as - - .. math:: - - y_i &= \\theta d_i + x_i' \\beta d_i + u_i, - - s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, - - d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, - - with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where - :math:`\\Sigma^2_x` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` - :math:`z_i \\sim \\mathcal{N}(0, 1)`, - :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, - :math:`w_i \\sim \\mathcal{N}(0, 1)`. - - - The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, - Huber and Lafférs (2023). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - mar: - Boolean. Indicates whether missingness at random holds. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. - - References - ---------- - Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, - Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 - """ - if mar: - sigma = np.array([[1, 0], [0, 1]]) - gamma = 0 - else: - sigma = np.array([[1, 0.8], [0.8, 1]]) - gamma = 1 - - e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [0.4 / (k**2) for k in range(1, dim_x + 1)] - - d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) - z = np.random.randn(n_obs) - s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) - - y = np.dot(x, beta) + theta * d + e[1] - y[s == 0] = 0 - - if return_type in _array_alias: - return x, y, d, z, s - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - if mar: - data = pd.DataFrame(np.column_stack((x, y, d, s)), columns=x_cols + ["y", "d", "s"]) - else: - data = pd.DataFrame(np.column_stack((x, y, d, z, s)), columns=x_cols + ["y", "d", "z", "s"]) - if return_type in _data_frame_alias: - return data - else: - if mar: - return DoubleMLData(data, "y", "d", x_cols, None, None, "s") - return DoubleMLData(data, "y", "d", x_cols, "z", None, "s") - else: - raise ValueError("Invalid return_type.") - - -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): - """ - Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an - underlying continous treatment). - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - A continuous treatment :math:`D_{\\text{cont}}` is generated as - - .. math:: - - D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, - - where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment - effect is defined as - - .. math:: - - \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. - - Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of - :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels - is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. - - The potential outcomes are defined as - - .. math:: - - Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y - - Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), - - where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as - - .. 
math:: - - Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. - - The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``200``. - - n_levels : int - The number of treatment levels. - Default is ``3``. - - linear : bool - Indicates whether the true underlying regression is linear. - Default is ``False``. - - random_state : int - Random seed for reproducibility. - Default is ``42``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - The oracle values contain the continuous treatment, the level bounds, the potential level, ITE - and the potential outcome without treatment. - - """ - if random_state is not None: - np.random.seed(random_state) - xi = kwargs.get("xi", 0.3) - c = kwargs.get("c", 0.0) - dim_x = kwargs.get("dim_x", 5) - - if not isinstance(n_levels, int): - raise ValueError("n_levels must be an integer.") - if n_levels < 2: - raise ValueError("n_levels must be at least 2.") - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_treatment(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - def treatment_effect(d, scale=15): - return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - if linear: - g = f_reg(x) - m = f_treatment(x, xi) - else: - assert not linear - g = f_reg(z) - m = f_treatment(z, xi) - - cont_d = m + eps_d - level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) - potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 - eta = np.random.uniform(0, 1, size=n_obs) - d = 1.0 * (eta >= 1 / n_levels) * potential_level - - ite = treatment_effect(cont_d) - y0 = g + eps_y - # only treated for d > 0 compared to the baseline - y = ite * (d > 0) + y0 - - oracle_values = { - "cont_d": cont_d, - "level_bounds": level_bounds, - "potential_level": potential_level, - "ite": ite, - "y0": y0, - } - - resul_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - - return resul_dict diff --git a/doubleml/datasets/__init__.py b/doubleml/datasets/__init__.py new file mode 100644 index 00000000..6a64a5c8 --- /dev/null +++ b/doubleml/datasets/__init__.py @@ -0,0 +1,13 @@ +""" +The :mod:`doubleml.datasets` module implements data generating processes for double machine learning simulations and provides access to real datasets. 
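A sketch of the intended imports after this reorganization (the real-data fetchers are exposed by ``doubleml.datasets`` via the ``__init__.py`` below; downloading the data requires network access)::

    from doubleml.datasets import fetch_401K, fetch_bonus

    # 401(k) data as a DoubleMLData object (net_tfa as outcome, e401 as treatment)
    dml_401k = fetch_401K(return_type="DoubleMLData")
    # Pennsylvania reemployment bonus data as a plain pandas DataFrame
    df_bonus = fetch_bonus(return_type="DataFrame")
    print(df_bonus.shape)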
+""" + +# Import fetch functions +from .fetch_401K import fetch_401K +from .fetch_bonus import fetch_bonus + + +__all__ = [ + "fetch_401K", + "fetch_bonus", +] diff --git a/doubleml/datasets/fetch_401K.py b/doubleml/datasets/fetch_401K.py new file mode 100644 index 00000000..05a97fe7 --- /dev/null +++ b/doubleml/datasets/fetch_401K.py @@ -0,0 +1,65 @@ +""" +Data set on financial wealth and 401(k) plan participation. +""" + +import pandas as pd +from doubleml import DoubleMLData + + +def _get_array_alias(): + return ["array", "np.array", "np.ndarray"] + + +def _get_data_frame_alias(): + return ["DataFrame", "pd.DataFrame", "pandas.DataFrame"] + + +def _get_dml_data_alias(): + return ["DoubleMLData"] + + +def fetch_401K(return_type="DoubleMLData", polynomial_features=False): + """ + Data set on financial wealth and 401(k) plan participation. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of + Econometrics, 113(2): 231-263. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + _array_alias = _get_array_alias() + _data_frame_alias = _get_data_frame_alias() + _dml_data_alias = _get_dml_data_alias() + + url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta" + raw_data = pd.read_stata(url) + + y_col = "net_tfa" + d_cols = ["e401"] + x_cols = ["age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown"] + + data = raw_data.copy() + + if polynomial_features: + raise NotImplementedError("polynomial_features os not implemented yet for fetch_401K.") + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/datasets/fetch_bonus.py b/doubleml/datasets/fetch_bonus.py new file mode 100644 index 00000000..155100c3 --- /dev/null +++ b/doubleml/datasets/fetch_bonus.py @@ -0,0 +1,98 @@ +""" +Data set on the Pennsylvania Reemployment Bonus experiment. +""" + +import numpy as np +import pandas as pd +from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures +from doubleml import DoubleMLData + + +def _get_array_alias(): + return ["array", "np.array", "np.ndarray"] + + +def _get_data_frame_alias(): + return ["DataFrame", "pd.DataFrame", "pandas.DataFrame"] + + +def _get_dml_data_alias(): + return ["DoubleMLData"] + + +def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): + """ + Data set on the Pennsylvania Reemployment Bonus experiment. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Bilias Y. 
(2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. + Journal of Applied Econometrics, 15(6): 575-594. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + _array_alias = _get_array_alias() + _data_frame_alias = _get_data_frame_alias() + _dml_data_alias = _get_dml_data_alias() + + url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat" + raw_data = pd.read_csv(url, sep=r"\s+") + + ind = (raw_data["tg"] == 0) | (raw_data["tg"] == 4) + data = raw_data.copy()[ind] + data.reset_index(inplace=True) + data["tg"] = data["tg"].replace(4, 1) + data["inuidur1"] = np.log(data["inuidur1"]) + + # variable dep as factor (dummy encoding) + dummy_enc = OneHotEncoder(drop="first", categories="auto").fit(data.loc[:, ["dep"]]) + xx = dummy_enc.transform(data.loc[:, ["dep"]]).toarray() + data["dep1"] = xx[:, 0] + data["dep2"] = xx[:, 1] + + y_col = "inuidur1" + d_cols = ["tg"] + x_cols = [ + "female", + "black", + "othrace", + "dep1", + "dep2", + "q2", + "q3", + "q4", + "q5", + "q6", + "agelt35", + "agegt54", + "durable", + "lusd", + "husd", + ] + + if polynomial_features: + poly = PolynomialFeatures(2, include_bias=False) + data_transf = poly.fit_transform(data[x_cols]) + x_cols = list(poly.get_feature_names_out(x_cols)) + + data_transf = pd.DataFrame(data_transf, columns=x_cols) + data = pd.concat((data[[y_col] + d_cols], data_transf), axis=1, sort=False) + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/datasets/__init__.py b/doubleml/irm/datasets/__init__.py new file mode 100644 index 00000000..05f95134 --- /dev/null +++ b/doubleml/irm/datasets/__init__.py @@ -0,0 +1,20 @@ +""" +The :mod:`doubleml.irm.datasets` module implements data generating processes for interactive regression models. +""" + +from .dgp_confounded_irm_data import make_confounded_irm_data +from .dgp_heterogeneous_data import make_heterogeneous_data +from .dgp_iivm_data import make_iivm_data +from .dgp_irm_data import make_irm_data +from .dgp_irm_data_discrete_treatments import make_irm_data_discrete_treatments +from .dgp_ssm_data import make_ssm_data + + +__all__ = [ + "make_confounded_irm_data", + "make_heterogeneous_data", + "make_iivm_data", + "make_irm_data", + "make_irm_data_discrete_treatments", + "make_ssm_data", +] diff --git a/doubleml/irm/datasets/dgp_confounded_irm_data.py b/doubleml/irm/datasets/dgp_confounded_irm_data.py new file mode 100644 index 00000000..2452e896 --- /dev/null +++ b/doubleml/irm/datasets/dgp_confounded_irm_data.py @@ -0,0 +1,232 @@ +import numpy as np +import warnings +from scipy.linalg import toeplitz + + +def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): + """ + Generates counfounded data from an interactive regression model. + + The data generating process is defined as follows (inspired by the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. 
+ Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + First, define the propensity score as + + .. math:: + + m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A + + where + + .. math:: + + p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, + + f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). + + and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. + Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as + + .. math:: + + P(D=1|X) = p(Z). + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) + \\cdot D (Z_5 + 1) + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be + set via the parameters ``gamma_a`` and ``beta_a``. + + The observed data is given as :math:`W = (Y, D, Z)`. + Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, + the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and + in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) + are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``0.0``. + gamma_a : float + Coefficient of the unobserved confounder in the propensity score. + Default is ``0.127``. + beta_a : float + Coefficient of the unobserved confounder in the outcome regression. + Default is ``0.58``. + linear : bool + If ``True``, ``Z`` will be set to ``X``, such that the underlying (short) models are linear/logistic. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant'Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_.
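A usage sketch for this generator (not part of the patch; it assumes ``make_confounded_irm_data`` is exported from ``doubleml.irm.datasets`` as in the new ``__init__.py`` above)::

    import numpy as np
    from doubleml.irm.datasets import make_confounded_irm_data

    np.random.seed(42)
    res = make_confounded_irm_data(n_obs=5_000, theta=0.0, gamma_a=0.127, beta_a=0.58)
    x, y, d = res["x"], res["y"], res["d"]
    oracle = res["oracle_values"]
    # in-sample confounding strengths and the implied sensitivity parameters
    print(oracle["cf_y"], oracle["cf_d_ate"], oracle["rho_ate"])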
+ """ + c = 0.0 # the confounding strength is only valid for c=0 + xi = 0.75 + dim_x = kwargs.get("dim_x", 5) + trimming_threshold = kwargs.get("trimming_threshold", 0.01) + var_eps_y = kwargs.get("var_eps_y", 1.0) + + # Specification of main regression function + def f_reg(w): + res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) + return res + + # Specification of prop score function + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + z_tilde_5 = x[:, 4] + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + # error terms and unobserved confounder + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # Choose the features used in the models + if linear: + features_ps = x + features_reg = x + else: + features_ps = z + features_reg = z + + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + # compute short and long form of propensity score + m_long = p + gamma_a * a + m_short = p + # check propensity score bounds + if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): + m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) + m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) + warnings.warn( + f"Propensity score is close to 0 or 1. 
" + f"Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied" + ) + # generate treatment based on long form + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (m_long >= u) + # add treatment heterogeneity + d1x = z[:, 4] + 1 + var_dx = np.var(d * (d1x)) + cov_adx = gamma_a * var_a + # Outcome regression + g_partial_reg = f_reg(features_reg) + # short model + g_short_d0 = g_partial_reg + g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg + g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 + # long model + g_long_d0 = g_partial_reg + beta_a * a + g_long_d1 = theta * d1x + g_partial_reg + beta_a * a + g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 + # Potential outcomes + y_0 = g_long_d0 + eps_y + y_1 = g_long_d1 + eps_y + # Realized outcome + y = d * y_1 + (1.0 - d) * y_0 + # In-sample values for confounding strength + explained_residual_variance = np.square(g_long - g_short) + residual_variance = np.square(y - g_short) + cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) + # compute the Riesz representation + treated_weight = d / np.mean(d) + untreated_weight = (1.0 - d) / np.mean(d) + # Odds ratios + propensity_ratio_long = m_long / (1.0 - m_long) + rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) + rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) + propensity_ratio_short = m_short / (1.0 - m_short) + rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) + rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) + cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( + 1 / (m_long * (1 - m_long)) + ) + cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) + if (beta_a == 0) | (gamma_a == 0): + rho_ate = 0.0 + rho_atte = 0.0 + else: + rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] + rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] + oracle_values = { + "g_long": g_long, + "g_short": g_short, + "m_long": m_long, + "m_short": m_short, + "gamma_a": gamma_a, + "beta_a": beta_a, + "a": a, + "y_0": y_0, + "y_1": y_1, + "z": z, + "cf_y": cf_y, + "cf_d_ate": cf_d_ate, + "cf_d_atte": cf_d_atte, + "rho_ate": rho_ate, + "rho_atte": rho_atte, + } + res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + return res_dict diff --git a/doubleml/irm/datasets/dgp_heterogeneous_data.py b/doubleml/irm/datasets/dgp_heterogeneous_data.py new file mode 100644 index 00000000..0f1a1b15 --- /dev/null +++ b/doubleml/irm/datasets/dgp_heterogeneous_data.py @@ -0,0 +1,114 @@ +import numpy as np +import pandas as pd + + +def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): + """ + Creates a simple synthetic example for heterogeneous treatment effects. + The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). + + The data is generated as + + .. math:: + + Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i + + D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, + + where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i + \\sim\\mathcal{U}[-1,1]`. + If the treatment is set to be binary, the treatment is generated as + + .. math:: + D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. 
+ + The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support + which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. + Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending + on the dimension of :math:`x`. + + If the heterogeneity is univariate the conditional treatment effect takes the following form + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), + + whereas for the two-dimensional case the conditional treatment effect is defined as + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + Default is ``200``. + + p : int + Dimension of covariates. + Default is ``30``. + + support_size : int + Number of relevant (confounding) covariates. + Default is ``5``. + + n_x : int + Dimension of the heterogeneity. Can be either ``1`` or ``2``. + Default is ``1``. + + binary_treatment : bool + Indicates whether the treatment is binary. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``data``, ``effects``, ``treatment_effect``. + + """ + # simple input checks + assert n_x in [1, 2], "n_x must be either 1 or 2." + assert support_size <= p, "support_size must be smaller than p." + assert isinstance(binary_treatment, bool), "binary_treatment must be a boolean." + + # define treatment effects + if n_x == 1: + + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + + else: + assert n_x == 2 + + # redefine treatment effect + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) + + # Outcome support and coefficients + support_y = np.random.choice(np.arange(p), size=support_size, replace=False) + coefs_y = np.random.uniform(0, 1, size=support_size) + # treatment support and coefficients + support_d = support_y + coefs_d = np.random.uniform(0, 0.3, size=support_size) + + # noise + epsilon = np.random.uniform(-1, 1, size=n_obs) + eta = np.random.uniform(-1, 1, size=n_obs) + + # Generate controls, covariates, treatments and outcomes + x = np.random.uniform(0, 1, size=(n_obs, p)) + # Heterogeneous treatment effects + te = treatment_effect(x) + if binary_treatment: + d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) + else: + d = np.dot(x[:, support_d], coefs_d) + eta + y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon + + # Now we build the dataset + y_df = pd.DataFrame({"y": y}) + d_df = pd.DataFrame({"d": d}) + x_df = pd.DataFrame(data=x, index=np.arange(x.shape[0]), columns=[f"X_{i}" for i in range(x.shape[1])]) + + data = pd.concat([y_df, d_df, x_df], axis=1) + res_dict = {"data": data, "effects": te, "treatment_effect": treatment_effect} + return res_dict diff --git a/doubleml/irm/datasets/dgp_iivm_data.py b/doubleml/irm/datasets/dgp_iivm_data.py new file mode 100644 index 00000000..e8c1130f --- /dev/null +++ b/doubleml/irm/datasets/dgp_iivm_data.py @@ -0,0 +1,102 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2, return_type="DoubleMLData"): + """ + Generates data from a interactive IV regression 
(IIVM) model. + The data generating process is defined as + + .. math:: + + d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, + + y_i &= \\theta d_i + x_i' \\beta + u_i, + + with :math:`Z \\sim \\text{Bernoulli}(0.5)` and + + .. math:: + + \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). + + The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries + :math:`\\beta_j=\\frac{1}{j^2}`. + + The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and + Klaassen (2020). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + alpha_x : + The value of the parameter :math:`\\alpha_x`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion + Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. + """ + # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 + xx = np.random.multivariate_normal( + np.zeros(2), + np.array([[1.0, 0.3], [0.3, 1.0]]), + size=[ + n_obs, + ], + ) + u = xx[:, 0] + v = xx[:, 1] + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + + z = np.random.binomial( + p=0.5, + n=1, + size=[ + n_obs, + ], + ) + d = 1.0 * (alpha_x * z + v > 0) + y = d * theta + np.dot(x, beta) + u + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, "z") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/datasets/dgp_irm_data.py b/doubleml/irm/datasets/dgp_irm_data.py new file mode 100644 index 00000000..973902ec --- /dev/null +++ b/doubleml/irm/datasets/dgp_irm_data.py @@ -0,0 +1,103 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type="DoubleMLData"): + """ + Generates data from a interactive regression (IRM) model. + The data generating process is defined as + + .. 
math:: + + d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i + \\sim \\mathcal{U}(0,1), + + y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and + :math:`c_d` are given by + + .. math:: + + c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = + \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. + + The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni + et al. (2017). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + R2_d : + The value of the parameter :math:`R_d^2`. + R2_y : + The value of the parameter :math:`R_y^2`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + + References + ---------- + Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With + High‐Dimensional Data. Econometrica, 85: 233-298. + """ + # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement + v = np.random.uniform( + size=[ + n_obs, + ] + ) + zeta = np.random.standard_normal( + size=[ + n_obs, + ] + ) + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) + c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) + c_d = np.sqrt(np.pi**2 / 3.0 * R2_d / ((1 - R2_d) * b_sigma_b)) + + xx = np.exp(np.dot(x, np.multiply(beta, c_d))) + d = 1.0 * ((xx / (1 + xx)) > v) + + y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py b/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py new file mode 100644 index 00000000..af621c9d --- /dev/null +++ b/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py @@ -0,0 +1,164 @@ +import numpy as np +from scipy.linalg import toeplitz + + +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): + """ + Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an + underlying continous treatment). + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). 
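A usage sketch for ``make_irm_data`` above (seed and argument values are arbitrary; the array return type yields the raw ``(x, y, d)`` triple):

>>> import numpy as np
>>> from doubleml.irm.datasets import make_irm_data
>>> np.random.seed(3141)
>>> x, y, d = make_irm_data(n_obs=500, dim_x=20, theta=0.5, R2_d=0.5, R2_y=0.5, return_type="array")
>>> share_treated = d.mean()  # close to 0.5, since x'beta is symmetric around zero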
+ + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + A continuous treatment :math:`D_{\\text{cont}}` is generated as + + .. math:: + + D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, + + where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment + effect is defined as + + .. math:: + + \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. + + Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of + :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels + is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. + + The potential outcomes are defined as + + .. math:: + + Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y + + Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), + + where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as + + .. math:: + + Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. + + The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``200``. + + n_levels : int + The number of treatment levels. + Default is ``3``. + + linear : bool + Indicates whether the true underlying regression is linear. + Default is ``False``. + + random_state : int + Random seed for reproducibility. + Default is ``42``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + The oracle values contain the continuous treatment, the level bounds, the potential level, ITE + and the potential outcome without treatment. 
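A usage sketch (argument values are arbitrary; the oracle entries follow the Returns description above):

>>> from doubleml.irm.datasets import make_irm_data_discrete_treatments
>>> res = make_irm_data_discrete_treatments(n_obs=500, n_levels=3, linear=False, random_state=42)
>>> x, y, d = res["x"], res["y"], res["d"]         # d takes values in {0, 1, 2, 3}
>>> ite = res["oracle_values"]["ite"]              # oracle effect of the underlying continuous dose
>>> bounds = res["oracle_values"]["level_bounds"]  # quantile cut points of the continuous treatment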
+ + """ + if random_state is not None: + np.random.seed(random_state) + xi = kwargs.get("xi", 0.3) + c = kwargs.get("c", 0.0) + dim_x = kwargs.get("dim_x", 5) + + if not isinstance(n_levels, int): + raise ValueError("n_levels must be an integer.") + if n_levels < 2: + raise ValueError("n_levels must be at least 2.") + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_treatment(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + def treatment_effect(d, scale=15): + return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + if linear: + g = f_reg(x) + m = f_treatment(x, xi) + else: + assert not linear + g = f_reg(z) + m = f_treatment(z, xi) + + cont_d = m + eps_d + level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) + potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 + eta = np.random.uniform(0, 1, size=n_obs) + d = 1.0 * (eta >= 1 / n_levels) * potential_level + + ite = treatment_effect(cont_d) + y0 = g + eps_y + # only treated for d > 0 compared to the baseline + y = ite * (d > 0) + y0 + + oracle_values = { + "cont_d": cont_d, + "level_bounds": level_bounds, + "potential_level": potential_level, + "ite": ite, + "y0": y0, + } + + resul_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + + return resul_dict diff --git a/doubleml/irm/datasets/dgp_ssm_data.py b/doubleml/irm/datasets/dgp_ssm_data.py new file mode 100644 index 00000000..6a6a5bee --- /dev/null +++ b/doubleml/irm/datasets/dgp_ssm_data.py @@ -0,0 +1,102 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLData"): + """ + Generates data from a sample selection model (SSM). + The data generating process is defined as + + .. math:: + + y_i &= \\theta d_i + x_i' \\beta d_i + u_i, + + s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, + + d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, + + with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where + :math:`\\Sigma^2_x` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` + :math:`z_i \\sim \\mathcal{N}(0, 1)`, + :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, + :math:`w_i \\sim \\mathcal{N}(0, 1)`. 
+ + + The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, + Huber and Lafférs (2023). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + mar: + Boolean. Indicates whether missingness at random holds. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. + + References + ---------- + Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, + Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 + """ + if mar: + sigma = np.array([[1, 0], [0, 1]]) + gamma = 0 + else: + sigma = np.array([[1, 0.8], [0.8, 1]]) + gamma = 1 + + e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [0.4 / (k**2) for k in range(1, dim_x + 1)] + + d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) + z = np.random.randn(n_obs) + s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) + + y = np.dot(x, beta) + theta * d + e[1] + y[s == 0] = 0 + + if return_type in _array_alias: + return x, y, d, z, s + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + if mar: + data = pd.DataFrame(np.column_stack((x, y, d, s)), columns=x_cols + ["y", "d", "s"]) + else: + data = pd.DataFrame(np.column_stack((x, y, d, z, s)), columns=x_cols + ["y", "d", "z", "s"]) + if return_type in _data_frame_alias: + return data + else: + if mar: + return DoubleMLData(data, "y", "d", x_cols, None, None, "s") + return DoubleMLData(data, "y", "d", x_cols, "z", None, "s") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py new file mode 100644 index 00000000..f8928902 --- /dev/null +++ b/doubleml/plm/datasets/__init__.py @@ -0,0 +1,20 @@ +""" +The :mod:`doubleml.plm.datasets` module implements data generating processes for partially linear models. +""" + +from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 +from .dgp_plr_turrell2018 import make_plr_turrell2018 +from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_pliv_CHS2015 import make_pliv_CHS2015 +from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 +from ._make_pliv_data import _make_pliv_data + + +__all__ = [ + "make_plr_CCDDHNR2018", + "make_plr_turrell2018", + "make_confounded_plr_data", + "make_pliv_CHS2015", + "make_pliv_multiway_cluster_CKMS2021", + "_make_pliv_data", +] diff --git a/doubleml/plm/datasets/_make_pliv_data.py b/doubleml/plm/datasets/_make_pliv_data.py new file mode 100644 index 00000000..deb7cc53 --- /dev/null +++ b/doubleml/plm/datasets/_make_pliv_data.py @@ -0,0 +1,70 @@ +""" +Helper function for partially linear IV data generation. 
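The new ``doubleml.plm.datasets`` module above collects the partially linear generators; a minimal import-and-call sketch (argument values arbitrary):

>>> import numpy as np
>>> from doubleml.plm.datasets import make_plr_CCDDHNR2018, make_pliv_multiway_cluster_CKMS2021
>>> np.random.seed(3141)
>>> dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5)     # returns a DoubleMLData object
>>> dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=25, M=25)  # returns a DoubleMLClusterData object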
+""" + +import numpy as np +import pandas as pd +from sklearn.datasets import make_spd_matrix + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def _g(x): + return np.power(np.sin(x), 2) + + +def _m(x, nu=0.0, gamma=1.0): + return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) + + +def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type="DoubleMLData"): + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal( + np.zeros(dim_x), + sigma, + size=[ + n_obs, + ], + ) + G = _g(np.dot(x, b)) + # instrument + z = _m(np.dot(x, b)) + np.random.standard_normal( + size=[ + n_obs, + ] + ) + # treatment + M = _m(gamma_z * z + np.dot(x, b)) + d = M + np.random.standard_normal( + size=[ + n_obs, + ] + ) + y = ( + np.dot(theta, d) + + G + + np.random.standard_normal( + size=[ + n_obs, + ] + ) + ) + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, "z") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_confounded_plr_data.py b/doubleml/plm/datasets/dgp_confounded_plr_data.py new file mode 100644 index 00000000..794e3db1 --- /dev/null +++ b/doubleml/plm/datasets/dgp_confounded_plr_data.py @@ -0,0 +1,171 @@ +import numpy as np +from scipy.linalg import toeplitz +from scipy.optimize import minimize_scalar + + +def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): + """ + Generates counfounded data from an partially linear regression model. + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, + where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the treatment as + + .. math:: + + D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D + + and with :math:`\\varepsilon \\sim \\mathcal{N}(0,1)`. + Since :math:`A` is independent of :math:`X`, the long and short form of the treatment regression are given as + + .. math:: + + E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + + E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. 
+ This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. + Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). + + The observed data is given as :math:`W = (Y, D, X)`. + Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, the effect :math:`\\theta`, + the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and + the propensity score are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``5.0``. + cf_y : float + Percentage of the residual variation of the outcome explained by latent/confounding variable. + Default is ``0.04``. + cf_d : float + Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable. + Default is ``0.04``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant'Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. + """ + c = kwargs.get("c", 0.0) + dim_x = kwargs.get("dim_x", 4) + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # get the required impact of the confounder on the propensity score + m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] + + def f_m(gamma_a): + rr_long = eps_d / var_eps_d + rr_short = (gamma_a * a + eps_d) / (gamma_a**2 * var_a + var_eps_d) + C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) + return np.square(C2_D / (1 + C2_D) - cf_d) + + gamma_a = minimize_scalar(f_m).x + m_long = m_short + gamma_a * a + d = m_long + eps_d + + # short and long version of g + g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) + + var_d = np.var(d) + + def f_g(beta_a): + g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) + y_diff = eps_y + g_diff + return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) + + beta_a = minimize_scalar(f_g).x + + g_long = theta * d + g_partial_reg + beta_a * a + g_short = 
(theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg + + y = g_long + eps_y + + oracle_values = { + "g_long": g_long, + "g_short": g_short, + "m_long": m_long, + "m_short": m_short, + "theta": theta, + "gamma_a": gamma_a, + "beta_a": beta_a, + "a": a, + "z": z, + } + + res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + + return res_dict diff --git a/doubleml/plm/datasets/dgp_pliv_CHS2015.py b/doubleml/plm/datasets/dgp_pliv_CHS2015.py new file mode 100644 index 00000000..7542803a --- /dev/null +++ b/doubleml/plm/datasets/dgp_pliv_CHS2015.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_data_alias + + +def make_pliv_CHS2015(n_obs, alpha=1.0, dim_x=200, dim_z=150, return_type="DoubleMLData"): + """ + Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). + The data generating process is defined as + + .. math:: + + z_i &= \\Pi x_i + \\zeta_i, + + d_i &= x_i' \\gamma + z_i' \\delta + u_i, + + y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, + + with + + .. math:: + + \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ + 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) + + where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. + :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, + :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` + and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + alpha : + The value of the causal parameter. + dim_x : + The number of covariates. + dim_z : + The number of instruments. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear + Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. 
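A usage sketch for ``make_pliv_CHS2015`` (``n_obs`` is required; the reduced dimensions relative to the defaults are arbitrary choices to keep the example fast):

>>> import numpy as np
>>> from doubleml.plm.datasets import make_pliv_CHS2015
>>> np.random.seed(3141)
>>> dml_data = make_pliv_CHS2015(n_obs=500, alpha=1.0, dim_x=20, dim_z=15)  # DoubleMLData with instruments Z1, ..., Z15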
+ """ + assert dim_x >= dim_z + # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf + xx = np.random.multivariate_normal( + np.zeros(2), + np.array([[1.0, 0.6], [0.6, 1.0]]), + size=[ + n_obs, + ], + ) + epsilon = xx[:, 0] + u = xx[:, 1] + + sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + sigma, + size=[ + n_obs, + ], + ) + + I_z = np.eye(dim_z) + xi = np.random.multivariate_normal( + np.zeros(dim_z), + 0.25 * I_z, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + gamma = beta + delta = [1 / (k**2) for k in range(1, dim_z + 1)] + Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) + + z = np.dot(x, np.transpose(Pi)) + xi + d = np.dot(x, gamma) + np.dot(z, delta) + u + y = alpha * d + np.dot(x, beta) + epsilon + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + z_cols = [f"Z{i + 1}" for i in np.arange(dim_z)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d"] + z_cols) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, z_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py new file mode 100644 index 00000000..df2b4cbe --- /dev/null +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -0,0 +1,199 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLClusterData +from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias + + +def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs): + """ + Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. + (2021). The data generating process is defined as + + .. math:: + + Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, + + D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, + + Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, + + with + + .. math:: + + X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X + + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, + + \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon + + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, + + v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v + + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, + + V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V + + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, + + and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` + where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries + :math:`\\Sigma_{kj} = s_X^{|j-k|}`. + Further + + .. 
math:: + + \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) + \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ + s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) + + + and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. + + Parameters + ---------- + N : + The number of observations (first dimension). + M : + The number of observations (second dimension). + dim_X : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where + ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s + ``(x, y, d, cluster_vars, z)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, + :math:`s_X = s_{\\varepsilon v} = 0.25`, + or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries + :math:`(\\zeta_{0})_j = 0.5^j`. + + References + ---------- + Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, + Journal of Business & Economic Statistics, + doi: `10.1080/07350015.2021.1895815 `_, + arXiv:`1909.03489 `_. + """ + # additional parameters specifiable via kwargs + pi_10 = kwargs.get("pi_10", 1.0) + + xx = np.arange(1, dim_X + 1) + zeta_0 = kwargs.get("zeta_0", np.power(0.5, xx)) + pi_20 = kwargs.get("pi_20", np.power(0.5, xx)) + xi_0 = kwargs.get("xi_0", np.power(0.5, xx)) + + omega_X = kwargs.get("omega_X", np.array([0.25, 0.25])) + omega_epsilon = kwargs.get("omega_epsilon", np.array([0.25, 0.25])) + omega_v = kwargs.get("omega_v", np.array([0.25, 0.25])) + omega_V = kwargs.get("omega_V", np.array([0.25, 0.25])) + + s_X = kwargs.get("s_X", 0.25) + s_epsilon_v = kwargs.get("s_epsilon_v", 0.25) + + # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., + # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] + # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] + + alpha_V = np.random.normal(size=(N * M)) + alpha_V_i = np.repeat(np.random.normal(size=N), M) + alpha_V_j = np.tile(np.random.normal(size=M), N) + + cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) + alpha_eps_v = np.random.multivariate_normal( + np.zeros(2), + cov_mat, + size=[ + N * M, + ], + ) + alpha_eps = alpha_eps_v[:, 0] + alpha_v = alpha_eps_v[:, 1] + + alpha_eps_v_i = np.random.multivariate_normal( + np.zeros(2), + cov_mat, + size=[ + N, + ], + ) + alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) + alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) + + alpha_eps_v_j = np.random.multivariate_normal( + np.zeros(2), + cov_mat, + size=[ + M, + ], + ) + alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) + alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) + + cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) + alpha_X = np.random.multivariate_normal( + np.zeros(dim_X), + cov_mat, + size=[ + N * M, + ], + ) + alpha_X_i = np.repeat( + np.random.multivariate_normal( + np.zeros(dim_X), + 
cov_mat, + size=[ + N, + ], + ), + M, + axis=0, + ) + alpha_X_j = np.tile( + np.random.multivariate_normal( + np.zeros(dim_X), + cov_mat, + size=[ + M, + ], + ), + (N, 1), + ) + + # generate variables + x = (1 - omega_X[0] - omega_X[1]) * alpha_X + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j + + eps = ( + (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j + ) + + v = (1 - omega_v[0] - omega_v[1]) * alpha_v + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j + + V = (1 - omega_V[0] - omega_V[1]) * alpha_V + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j + + z = np.matmul(x, xi_0) + V + d = z * pi_10 + np.matmul(x, pi_20) + v + y = d * theta + np.matmul(x, zeta_0) + eps + + cluster_cols = ["cluster_var_i", "cluster_var_j"] + cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) + + if return_type in _array_alias: + return x, y, d, cluster_vars.values, z + elif return_type in _data_frame_alias + _dml_cluster_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] + data = pd.concat((cluster_vars, pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["Y", "D", "Z"])), axis=1) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py b/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py new file mode 100644 index 00000000..7d6fdf9e --- /dev/null +++ b/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", **kwargs): + """ + Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. + The nuisance functions are given by + + .. math:: + + m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, + + g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + alpha : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. + + References + ---------- + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. 
(2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + a_0 = kwargs.get("a_0", 1.0) + a_1 = kwargs.get("a_1", 0.25) + s_1 = kwargs.get("s_1", 1.0) + + b_0 = kwargs.get("b_0", 1.0) + b_1 = kwargs.get("b_1", 0.25) + s_2 = kwargs.get("s_2", 1.0) + + cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + d = ( + a_0 * x[:, 0] + + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) + + s_1 + * np.random.standard_normal( + size=[ + n_obs, + ] + ) + ) + y = ( + alpha * d + + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) + + b_1 * x[:, 2] + + s_2 + * np.random.standard_normal( + size=[ + n_obs, + ] + ) + ) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_plr_turrell2018.py b/doubleml/plm/datasets/dgp_plr_turrell2018.py new file mode 100644 index 00000000..5cfefdd8 --- /dev/null +++ b/doubleml/plm/datasets/dgp_plr_turrell2018.py @@ -0,0 +1,107 @@ +import numpy as np +import pandas as pd +from sklearn.datasets import make_spd_matrix + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def _g(x): + return np.power(np.sin(x), 2) + + +def _m(x, nu=0.0, gamma=1.0): + return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) + + +def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type="DoubleMLData", **kwargs): + """ + Generates data from a partially linear regression model used in a blog article by Turrell (2018). + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, + positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. + :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by + + .. math:: + + m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, + + g_0(x_i) &= \\sin(x_i)^2. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\nu=0`, or :math:`\\gamma=1`. + + References + ---------- + Turrell, A. 
(2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, + science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ + `_. + """ + nu = kwargs.get("nu", 0.0) + gamma = kwargs.get("gamma", 1.0) + + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal( + np.zeros(dim_x), + sigma, + size=[ + n_obs, + ], + ) + G = _g(np.dot(x, b)) + M = _m(np.dot(x, b), nu=nu, gamma=gamma) + d = M + np.random.standard_normal( + size=[ + n_obs, + ] + ) + y = ( + np.dot(theta, d) + + G + + np.random.standard_normal( + size=[ + n_obs, + ] + ) + ) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols) + else: + raise ValueError("Invalid return_type.") From 56d832c372fb2637632dc1711455a1239574c9e0 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:41:02 +0200 Subject: [PATCH 09/84] update tests acc. to Refactor Data Generators #306 --- doubleml/plm/tests/conftest.py | 2 +- .../plm/tests/test_pliv_external_predictions.py | 2 +- doubleml/plm/tests/test_plr_external_predictions.py | 2 +- doubleml/tests/conftest.py | 2 +- doubleml/tests/test_datasets.py | 13 +++++++------ doubleml/tests/test_evaluate_learner.py | 2 +- doubleml/tests/test_exceptions.py | 9 ++------- doubleml/tests/test_exceptions_ext_preds.py | 2 +- doubleml/tests/test_framework.py | 2 +- doubleml/tests/test_model_defaults.py | 9 ++------- doubleml/tests/test_multiway_cluster.py | 2 +- doubleml/tests/test_nonlinear_cluster.py | 3 ++- doubleml/tests/test_return_types.py | 10 ++-------- doubleml/tests/test_scores.py | 3 ++- doubleml/tests/test_sensitivity.py | 2 +- doubleml/tests/test_sensitivity_cluster.py | 2 +- doubleml/tests/test_set_ml_nuisance_params.py | 3 ++- doubleml/tests/test_set_sample_splitting.py | 2 +- 18 files changed, 30 insertions(+), 42 deletions(-) diff --git a/doubleml/plm/tests/conftest.py b/doubleml/plm/tests/conftest.py index 497d6fc9..cfde0f41 100644 --- a/doubleml/plm/tests/conftest.py +++ b/doubleml/plm/tests/conftest.py @@ -4,7 +4,7 @@ from scipy.linalg import toeplitz from sklearn.datasets import make_spd_matrix -from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018 +from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_turrell2018 def _g(x): diff --git a/doubleml/plm/tests/test_pliv_external_predictions.py b/doubleml/plm/tests/test_pliv_external_predictions.py index bc8a1e8a..55c362ab 100644 --- a/doubleml/plm/tests/test_pliv_external_predictions.py +++ b/doubleml/plm/tests/test_pliv_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression from doubleml import DoubleMLData, DoubleMLPLIV -from doubleml.datasets import make_pliv_CHS2015 +from doubleml.plm.datasets import make_pliv_CHS2015 from doubleml.utils import DMLDummyRegressor diff --git a/doubleml/plm/tests/test_plr_external_predictions.py b/doubleml/plm/tests/test_plr_external_predictions.py index 47644555..160052b1 100644 --- a/doubleml/plm/tests/test_plr_external_predictions.py +++ b/doubleml/plm/tests/test_plr_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression from doubleml import DoubleMLData, DoubleMLPLR -from 
doubleml.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.datasets import make_plr_CCDDHNR2018 from doubleml.utils import DMLDummyRegressor diff --git a/doubleml/tests/conftest.py b/doubleml/tests/conftest.py index bf53d788..6abea18c 100644 --- a/doubleml/tests/conftest.py +++ b/doubleml/tests/conftest.py @@ -4,7 +4,7 @@ from sklearn.datasets import make_classification, make_regression, make_spd_matrix from doubleml import DoubleMLData -from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018 +from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_turrell2018 def _g(x): diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index 67f612e8..8f1c4f03 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -3,21 +3,22 @@ import pytest from doubleml import DoubleMLClusterData, DoubleMLData -from doubleml.datasets import ( - _make_pliv_data, - fetch_401K, - fetch_bonus, +from doubleml.datasets import fetch_401K, fetch_bonus +from doubleml.irm.datasets import ( make_confounded_irm_data, - make_confounded_plr_data, make_heterogeneous_data, make_iivm_data, make_irm_data, make_irm_data_discrete_treatments, + make_ssm_data, +) +from doubleml.plm.datasets import ( + _make_pliv_data, + make_confounded_plr_data, make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018, make_plr_turrell2018, - make_ssm_data, ) msg_inv_return_type = "Invalid return_type." diff --git a/doubleml/tests/test_evaluate_learner.py b/doubleml/tests/test_evaluate_learner.py index dbad9b62..2c5d3f9a 100644 --- a/doubleml/tests/test_evaluate_learner.py +++ b/doubleml/tests/test_evaluate_learner.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils._estimation import _logloss np.random.seed(3141) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index a4655bb9..d8fe4e7c 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -21,13 +21,8 @@ DoubleMLPQ, DoubleMLQTE, ) -from doubleml.datasets import ( - make_iivm_data, - make_irm_data, - make_pliv_CHS2015, - make_pliv_multiway_cluster_CKMS2021, - make_plr_CCDDHNR2018, -) +from doubleml.irm.datasets import make_iivm_data, make_irm_data +from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 from doubleml.did.datasets import make_did_SZ2020 from ._utils import DummyDataClass diff --git a/doubleml/tests/test_exceptions_ext_preds.py b/doubleml/tests/test_exceptions_ext_preds.py index 3f600282..a65b6ebb 100644 --- a/doubleml/tests/test_exceptions_ext_preds.py +++ b/doubleml/tests/test_exceptions_ext_preds.py @@ -2,7 +2,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from doubleml import DoubleMLCVAR, DoubleMLData, DoubleMLIRM, DoubleMLQTE -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor df_irm = make_irm_data(n_obs=10, dim_x=2, theta=0.5, return_type="DataFrame") diff --git a/doubleml/tests/test_framework.py b/doubleml/tests/test_framework.py index 24810b68..44dabb71 100644 --- a/doubleml/tests/test_framework.py +++ b/doubleml/tests/test_framework.py @@ -3,7 +3,7 @@ import pytest from sklearn.linear_model import LinearRegression, LogisticRegression -from 
doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.double_ml_framework import DoubleMLFramework, concat from doubleml.irm.irm import DoubleMLIRM diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py index f55a555c..8417468a 100644 --- a/doubleml/tests/test_model_defaults.py +++ b/doubleml/tests/test_model_defaults.py @@ -4,13 +4,8 @@ from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml -from doubleml.datasets import ( - make_iivm_data, - make_irm_data, - make_pliv_CHS2015, - make_plr_CCDDHNR2018, - make_ssm_data, -) +from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data +from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018 from doubleml.did.datasets import make_did_SZ2020 np.random.seed(3141) diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index b064024f..10e5d445 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -6,7 +6,7 @@ from sklearn.linear_model import Lasso, LinearRegression import doubleml as dml -from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 +from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from ..plm.tests._utils_pliv_manual import compute_pliv_residuals, fit_pliv from ._utils import _clone diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index f84f3e2e..71998941 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -7,7 +7,8 @@ from sklearn.linear_model import Lasso, LinearRegression import doubleml as dml -from doubleml.datasets import DoubleMLClusterData, make_pliv_multiway_cluster_CKMS2021 +from doubleml import DoubleMLClusterData +from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from .test_nonlinear_score_mixin import DoubleMLPLRWithNonLinearScoreMixin diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index 11ebd624..03676b74 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -23,14 +23,8 @@ DoubleMLPQ, DoubleMLSSM, ) -from doubleml.datasets import ( - make_iivm_data, - make_irm_data, - make_pliv_CHS2015, - make_pliv_multiway_cluster_CKMS2021, - make_plr_CCDDHNR2018, - make_ssm_data, -) +from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data +from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 from doubleml.did.datasets import make_did_SZ2020 np.random.seed(3141) diff --git a/doubleml/tests/test_scores.py b/doubleml/tests/test_scores.py index c3281702..0687546d 100644 --- a/doubleml/tests/test_scores.py +++ b/doubleml/tests/test_scores.py @@ -3,7 +3,8 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLIIVM, DoubleMLIRM, DoubleMLPLIV, DoubleMLPLR -from doubleml.datasets import make_iivm_data, make_irm_data, make_pliv_CHS2015, make_plr_CCDDHNR2018 +from doubleml.irm.datasets import make_iivm_data, make_irm_data +from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018 np.random.seed(3141) dml_data_plr = make_plr_CCDDHNR2018(n_obs=100) diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py index e4b43495..a0e47c0d 100644 --- a/doubleml/tests/test_sensitivity.py +++ b/doubleml/tests/test_sensitivity.py @@ -5,7 +5,7 @@ 
from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_benchmark_manual, doubleml_sensitivity_manual diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py index 65ec0d64..83f8c270 100644 --- a/doubleml/tests/test_sensitivity_cluster.py +++ b/doubleml/tests/test_sensitivity_cluster.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression import doubleml as dml -from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 +from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_benchmark_manual diff --git a/doubleml/tests/test_set_ml_nuisance_params.py b/doubleml/tests/test_set_ml_nuisance_params.py index a189b184..055bcbff 100644 --- a/doubleml/tests/test_set_ml_nuisance_params.py +++ b/doubleml/tests/test_set_ml_nuisance_params.py @@ -3,7 +3,8 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from doubleml import DoubleMLCVAR, DoubleMLIIVM, DoubleMLIRM, DoubleMLLPQ, DoubleMLPLIV, DoubleMLPLR, DoubleMLPQ -from doubleml.datasets import make_iivm_data, make_irm_data, make_pliv_CHS2015, make_plr_CCDDHNR2018 +from doubleml.irm.datasets import make_iivm_data, make_irm_data +from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018 # set default and test values n_est_default = 100 diff --git a/doubleml/tests/test_set_sample_splitting.py b/doubleml/tests/test_set_sample_splitting.py index 97313a00..0995d831 100644 --- a/doubleml/tests/test_set_sample_splitting.py +++ b/doubleml/tests/test_set_sample_splitting.py @@ -3,7 +3,7 @@ from sklearn.linear_model import Lasso from doubleml import DoubleMLPLR -from doubleml.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.datasets import make_plr_CCDDHNR2018 np.random.seed(3141) dml_data = make_plr_CCDDHNR2018(n_obs=10) From 02adb2488ada05014dfcdf927c48cbd6e22b8758 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:43:50 +0200 Subject: [PATCH 10/84] update docstrings acc. 
to Refactor Data Generators #306 --- doubleml/irm/iivm.py | 2 +- doubleml/irm/irm.py | 2 +- doubleml/plm/pliv.py | 2 +- doubleml/plm/plr.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index a43c0a03..70c09cde 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -80,7 +80,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_iivm_data + >>> from doubleml.irm.datasets import make_iivm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 9bf5ed35..10f6377c 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -84,7 +84,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py index ba022688..52cb796d 100644 --- a/doubleml/plm/pliv.py +++ b/doubleml/plm/pliv.py @@ -62,7 +62,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_pliv_CHS2015 + >>> from doubleml.plm.datasets import make_pliv_CHS2015 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index a81bac48..4a57dfcb 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -60,7 +60,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) From 39d4e7ea5098c02b64d44ed3edc08df76aa485fc Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:44:52 +0200 Subject: [PATCH 11/84] update docstrings acc. 
to Refactor Data Generators #306 --- doubleml/irm/apos.py | 2 +- doubleml/irm/cvar.py | 2 +- doubleml/irm/lpq.py | 2 +- doubleml/irm/pq.py | 2 +- doubleml/irm/qte.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 8099342a..2960e90d 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -673,7 +673,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index d2aeaced..57347dce 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -82,7 +82,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index c98e8fa2..f46fb38c 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -83,7 +83,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_iivm_data + >>> from doubleml.irm.datasets import make_iivm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index f64dc471..d0425845 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -90,7 +90,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 68b91a9a..a2c803a3 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -72,7 +72,7 @@ class DoubleMLQTE: -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) @@ -499,7 +499,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) From 83cfe9c88fe94cf172bdbcad1d67182f35957736 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:45:56 +0200 Subject: [PATCH 12/84] update irm submod tests acc. 
to Refactor Data Generators #306 --- doubleml/irm/tests/conftest.py | 2 +- doubleml/irm/tests/test_apo.py | 2 +- doubleml/irm/tests/test_iivm_external_predictions.py | 2 +- doubleml/irm/tests/test_irm.py | 2 +- doubleml/irm/tests/test_irm_external_predictions.py | 2 +- doubleml/irm/tests/test_lpq_external_predictions.py | 2 +- doubleml/irm/tests/test_pq_external_predictions.py | 2 +- doubleml/irm/tests/test_qte.py | 2 +- doubleml/irm/tests/test_qte_exceptions.py | 2 +- doubleml/irm/tests/test_ssm_exceptions.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doubleml/irm/tests/conftest.py b/doubleml/irm/tests/conftest.py index 1cf1d525..0a3d4db8 100644 --- a/doubleml/irm/tests/conftest.py +++ b/doubleml/irm/tests/conftest.py @@ -4,7 +4,7 @@ from scipy.linalg import toeplitz from sklearn.datasets import make_spd_matrix -from doubleml.datasets import make_iivm_data, make_irm_data +from doubleml.irm.datasets import make_iivm_data, make_irm_data def _g(x): diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index df4ec284..7558b7c1 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data, make_irm_data_discrete_treatments from ...tests._utils import draw_smpls from ._utils_apo_manual import boot_apo, fit_apo, fit_sensitivity_elements_apo diff --git a/doubleml/irm/tests/test_iivm_external_predictions.py b/doubleml/irm/tests/test_iivm_external_predictions.py index 7f4626e9..d71d2bb5 100644 --- a/doubleml/irm/tests/test_iivm_external_predictions.py +++ b/doubleml/irm/tests/test_iivm_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLData, DoubleMLIIVM -from doubleml.datasets import make_iivm_data +from doubleml.irm.datasets import make_iivm_data from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor diff --git a/doubleml/irm/tests/test_irm.py b/doubleml/irm/tests/test_irm.py index f99f2253..856c7f59 100644 --- a/doubleml/irm/tests/test_irm.py +++ b/doubleml/irm/tests/test_irm.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils.resampling import DoubleMLResampling from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_irm_external_predictions.py b/doubleml/irm/tests/test_irm_external_predictions.py index dabf6c0e..5d0412d5 100644 --- a/doubleml/irm/tests/test_irm_external_predictions.py +++ b/doubleml/irm/tests/test_irm_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLData, DoubleMLIRM -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor diff --git a/doubleml/irm/tests/test_lpq_external_predictions.py b/doubleml/irm/tests/test_lpq_external_predictions.py index 66f2ece6..48cb42f5 100644 --- a/doubleml/irm/tests/test_lpq_external_predictions.py +++ b/doubleml/irm/tests/test_lpq_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression from doubleml import DoubleMLData, DoubleMLLPQ 
-from doubleml.datasets import make_iivm_data +from doubleml.irm.datasets import make_iivm_data from doubleml.utils import DMLDummyClassifier from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_pq_external_predictions.py b/doubleml/irm/tests/test_pq_external_predictions.py index 28f8ec66..9674c464 100644 --- a/doubleml/irm/tests/test_pq_external_predictions.py +++ b/doubleml/irm/tests/test_pq_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression from doubleml import DoubleMLData, DoubleMLPQ -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils import DMLDummyClassifier from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py index 0557c85b..7fcbeec2 100644 --- a/doubleml/irm/tests/test_qte.py +++ b/doubleml/irm/tests/test_qte.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from ...tests._utils import confint_manual, draw_smpls from ...utils._estimation import _default_kde diff --git a/doubleml/irm/tests/test_qte_exceptions.py b/doubleml/irm/tests/test_qte_exceptions.py index 9f94f5d4..f4e95110 100644 --- a/doubleml/irm/tests/test_qte_exceptions.py +++ b/doubleml/irm/tests/test_qte_exceptions.py @@ -6,7 +6,7 @@ from doubleml import DoubleMLData, DoubleMLQTE from doubleml.data.base_data import DoubleMLBaseData -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data np.random.seed(42) n = 100 diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py index 6ff276e3..50b082ec 100644 --- a/doubleml/irm/tests/test_ssm_exceptions.py +++ b/doubleml/irm/tests/test_ssm_exceptions.py @@ -6,7 +6,7 @@ from doubleml import DoubleMLSSM from doubleml.data.base_data import DoubleMLBaseData -from doubleml.datasets import make_ssm_data +from doubleml.irm.datasets import make_ssm_data np.random.seed(3141) n = 100 From 3ff0edbbae50149d70fdb2b1ccc7f8d8cda75bc9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:47:01 +0200 Subject: [PATCH 13/84] update irm submod tests acc. 
to Refactor Data Generators #306 --- doubleml/irm/tests/test_apo_exceptions.py | 2 +- doubleml/irm/tests/test_apo_external_predictions.py | 2 +- doubleml/irm/tests/test_apos.py | 2 +- doubleml/irm/tests/test_apos_classfier.py | 2 +- doubleml/irm/tests/test_apos_exceptions.py | 2 +- doubleml/irm/tests/test_apos_external_predictions.py | 2 +- doubleml/irm/tests/test_apos_weighted_scores.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index cfb6e93b..e643efca 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_iivm_data, make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_irm_data_discrete_treatments n = 100 data_apo = make_irm_data_discrete_treatments(n_obs=n) diff --git a/doubleml/irm/tests/test_apo_external_predictions.py b/doubleml/irm/tests/test_apo_external_predictions.py index 2bbe50e8..246ef021 100644 --- a/doubleml/irm/tests/test_apo_external_predictions.py +++ b/doubleml/irm/tests/test_apo_external_predictions.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 746cb63c..55a48ced 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data, make_irm_data_discrete_treatments from ...tests._utils import confint_manual from ._utils_apos_manual import boot_apos, fit_apos diff --git a/doubleml/irm/tests/test_apos_classfier.py b/doubleml/irm/tests/test_apos_classfier.py index 06fdc308..f9cfc10c 100644 --- a/doubleml/irm/tests/test_apos_classfier.py +++ b/doubleml/irm/tests/test_apos_classfier.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from ...tests._utils import confint_manual from ._utils_apos_manual import boot_apos, fit_apos diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index c309b7e2..f1c9b3d6 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -4,7 +4,7 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLAPOS, DoubleMLData -from doubleml.datasets import make_iivm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_iivm_data, make_irm_data_discrete_treatments n = 100 data = make_irm_data_discrete_treatments(n_obs=n) diff --git a/doubleml/irm/tests/test_apos_external_predictions.py b/doubleml/irm/tests/test_apos_external_predictions.py index 9e97de07..ed4323ad 100644 --- 
a/doubleml/irm/tests/test_apos_external_predictions.py +++ b/doubleml/irm/tests/test_apos_external_predictions.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLAPOS, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_apos_weighted_scores.py b/doubleml/irm/tests/test_apos_weighted_scores.py index ea612dec..6d0a7f65 100644 --- a/doubleml/irm/tests/test_apos_weighted_scores.py +++ b/doubleml/irm/tests/test_apos_weighted_scores.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments @pytest.fixture( From caa530e523ff49c07f1447d432868385bb48b685 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:59:02 +0200 Subject: [PATCH 14/84] update irm submod tests acc. to Refactor Data Generators #306 --- doubleml/data/tests/conftest.py | 3 ++- doubleml/data/tests/test_cluster_data.py | 2 +- doubleml/data/tests/test_dml_data.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doubleml/data/tests/conftest.py b/doubleml/data/tests/conftest.py index 6960b58a..fcefabce 100644 --- a/doubleml/data/tests/conftest.py +++ b/doubleml/data/tests/conftest.py @@ -2,7 +2,8 @@ import pandas as pd import pytest -from doubleml.datasets import make_irm_data, make_plr_turrell2018 +from doubleml.irm.datasets import make_irm_data +from doubleml.plm.datasets import make_plr_turrell2018 @pytest.fixture(scope="session", params=[(500, 10), (1000, 20), (1000, 100)]) diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index e95dfa03..b02a3275 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -3,7 +3,7 @@ import pytest from doubleml import DoubleMLClusterData -from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 +from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 @pytest.mark.ci diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py index 7cf394b5..a2ada74b 100644 --- a/doubleml/data/tests/test_dml_data.py +++ b/doubleml/data/tests/test_dml_data.py @@ -5,12 +5,12 @@ from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM from doubleml.data.base_data import DoubleMLBaseData -from doubleml.datasets import ( +from doubleml.plm.datasets import ( _make_pliv_data, make_pliv_CHS2015, make_plr_CCDDHNR2018, - make_ssm_data, ) +from doubleml.irm.datasets import make_ssm_data from doubleml.did.datasets import make_did_SZ2020 From 4cb9148833d05ab409053e29f0890d7df79299c9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:59:07 +0200 Subject: [PATCH 15/84] update docstrings acc. 
to Refactor Data Generators #306 --- doubleml/double_ml.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 764865a4..fe4cec5d 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1167,10 +1167,9 @@ def evaluate_learners(self, learners=None, metric=_rmse): Examples -------- - >>> import numpy as np - >>> import doubleml as dml + >>> import numpy as np >>> import doubleml as dml >>> from sklearn.metrics import mean_absolute_error - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) @@ -1284,10 +1283,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): self : object Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) From 312f601408e70c48cdb6f71b2803cfe570f21169 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 14:59:13 +0200 Subject: [PATCH 16/84] update docstrings acc. to Refactor Data Generators #306 --- doubleml/did/did_cs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index ab2af5b9..7f33210f 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -63,10 +63,9 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``True``. Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_did_SZ2020 + >>> from doubleml.did.datasets import make_did_SZ2020 >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(42) >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) From 0d07790466aff6674e3d3d34163643ae26f025f8 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 15:00:40 +0200 Subject: [PATCH 17/84] update docstrings acc. to Refactor Data Generators #306 --- doubleml/data/base_data.py | 10 ++++------ doubleml/data/cluster_data.py | 5 ++--- doubleml/did/did.py | 5 ++--- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py index 318508e9..7a114220 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -135,9 +135,8 @@ class DoubleMLData(DoubleMLBaseData): Default is ``True``. Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + -------- >>> from doubleml import DoubleMLData + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> # initialization from pandas.DataFrame >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd') @@ -266,9 +265,8 @@ def from_arrays( Default is ``True``. 
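
Every docstring hunk in the patches above applies the same substitution: the dataset generators move out of the flat ``doubleml.datasets`` module into model-specific submodules (``doubleml.plm.datasets``, ``doubleml.irm.datasets``, ``doubleml.did.datasets``). A minimal end-to-end sketch with the new import path, mirroring the updated PLR docstring and assuming the refactored submodules introduced in this PR:

```python
import numpy as np
import doubleml as dml
from doubleml.plm.datasets import make_plr_CCDDHNR2018  # was: from doubleml.datasets import ...
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

np.random.seed(3141)
learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
ml_l = clone(learner)  # learner for the nuisance function E[Y|X]
ml_m = clone(learner)  # learner for the nuisance function E[D|X]

# generate data with the relocated generator and fit a partially linear regression model
obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20)
dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_l, ml_m)
print(dml_plr_obj.fit().summary)
```
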
Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + -------- >>> from doubleml import DoubleMLData + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) """ diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py index 658ab0cc..2cb9fb4f 100644 --- a/doubleml/data/cluster_data.py +++ b/doubleml/data/cluster_data.py @@ -61,9 +61,8 @@ class DoubleMLClusterData(DoubleMLData): Default is ``True``. Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + -------- >>> from doubleml import DoubleMLClusterData + >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 >>> # initialization from pandas.DataFrame >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 7a671993..170535ea 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -63,10 +63,9 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``True``. Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_did_SZ2020 + >>> from doubleml.did.datasets import make_did_SZ2020 >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(42) >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) From 8b4f4bcd63876abecbce07bce21aa300ca4b9790 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 15:15:51 +0200 Subject: [PATCH 18/84] update documentations acc. to Refactor Data Generators #306 --- .github/ISSUE_TEMPLATE/bug_report.yml | 6 ++---- CONTRIBUTING.md | 2 +- doubleml/data/cluster_data.py | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index baa6d625..3e5321ea 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -23,12 +23,10 @@ body: attributes: label: Minimum reproducible code snippet description: | - Please provide a short reproducible code snippet. Example: - - ```python + Please provide a short reproducible code snippet. Example: ```python import numpy as np import doubleml as dml - from doubleml.datasets import make_plr_CCDDHNR2018 + from doubleml.plm.datasets import make_plr_CCDDHNR2018 from sklearn.ensemble import RandomForestRegressor from sklearn.base import clone np.random.seed(3141) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4809c62a..a614dd73 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,7 +15,7 @@ To submit a **bug report**, you can use our ```python import numpy as np import doubleml as dml -from doubleml.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.datasets import make_plr_CCDDHNR2018 from sklearn.ensemble import RandomForestRegressor from sklearn.base import clone np.random.seed(3141) diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py index 2cb9fb4f..89947b73 100644 --- a/doubleml/data/cluster_data.py +++ b/doubleml/data/cluster_data.py @@ -171,9 +171,8 @@ def from_arrays( Default is ``True``. 
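
The cluster data-backend keeps the same pattern; only the generator import moves. A brief, illustrative sketch (the ``DoubleMLPLIV`` call and the generator defaults are standard usage and not part of this diff):

```python
import numpy as np
import doubleml as dml
from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021  # new import path
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

np.random.seed(3141)
learner = RandomForestRegressor(n_estimators=100, max_depth=5)
ml_l, ml_m, ml_r = clone(learner), clone(learner), clone(learner)

# the generator returns a cluster data object (two-way clustering) by default
obj_dml_cluster_data = make_pliv_multiway_cluster_CKMS2021()
dml_pliv_obj = dml.DoubleMLPLIV(obj_dml_cluster_data, ml_l, ml_m, ml_r)
print(dml_pliv_obj.fit().summary)
```
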
Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + -------- >>> from doubleml import DoubleMLClusterData + >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) """ From 5c443952cfe497bfebea0135e4af1e2345148b9f Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 15:39:43 +0200 Subject: [PATCH 19/84] update tests acc. to Refactor Data Generators #306 --- doubleml/plm/tests/test_plr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/plm/tests/test_plr.py b/doubleml/plm/tests/test_plr.py index 79f21f84..65f5ad83 100644 --- a/doubleml/plm/tests/test_plr.py +++ b/doubleml/plm/tests/test_plr.py @@ -304,7 +304,7 @@ def test_dml_plr_cate_gate(score, cov_type): # collect data np.random.seed(42) - obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n) + obj_dml_data = dml.plm.datasets.make_plr_CCDDHNR2018(n_obs=n) ml_l = LinearRegression() ml_g = LinearRegression() ml_m = LinearRegression() From a9f428474950aa1703f52138b9d298ac62989dfa Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 22:59:07 +0200 Subject: [PATCH 20/84] upd --- doubleml/data/__init__.py | 8 +- doubleml/data/base_data.py | 290 ++++++++---------- doubleml/data/base_data_content.txt | Bin 0 -> 60862 bytes doubleml/data/cluster_data.py | 8 +- doubleml/data/did_data.py | 272 ++++++++++++++++ doubleml/data/panel_data.py | 50 ++- doubleml/data/rdd_data.py | 272 ++++++++++++++++ doubleml/data/ssm_data.py | 274 +++++++++++++++++ doubleml/data/tests/test_cluster_data.py | 139 ++++----- .../dgp_pliv_multiway_cluster_CKMS2021.py | 8 +- doubleml/tests/test_multiway_cluster.py | 9 +- doubleml/tests/test_nonlinear_cluster.py | 13 +- doubleml/tests/test_sensitivity_cluster.py | 4 +- doubleml/utils/_aliases.py | 5 +- 14 files changed, 1077 insertions(+), 275 deletions(-) create mode 100644 doubleml/data/base_data_content.txt create mode 100644 doubleml/data/did_data.py create mode 100644 doubleml/data/rdd_data.py create mode 100644 doubleml/data/ssm_data.py diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py index d8a920c6..dfe673e7 100644 --- a/doubleml/data/__init__.py +++ b/doubleml/data/__init__.py @@ -3,11 +3,15 @@ """ from .base_data import DoubleMLData -from .cluster_data import DoubleMLClusterData +from .did_data import DoubleMLDIDData from .panel_data import DoubleMLPanelData +from .rdd_data import DoubleMLRDDData +from .ssm_data import DoubleMLSSMData __all__ = [ "DoubleMLData", - "DoubleMLClusterData", + "DoubleMLDIDData", "DoubleMLPanelData", + "DoubleMLRDDData", + "DoubleMLSSMData", ] diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py index 7a114220..8d585633 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -4,14 +4,15 @@ import numpy as np import pandas as pd from sklearn.utils import assert_all_finite -from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d from doubleml.utils._estimation import _assure_2d_array class DoubleMLBaseData(ABC): - """Base Class Double machine learning data-backends""" + """Bas x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] + # baseline version with features, outcome and treatments + data = 
pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)Class Double machine learning data-backends""" def __init__(self, data): if not isinstance(data, pd.DataFrame): @@ -98,24 +99,23 @@ class DoubleMLData(DoubleMLBaseData): x_cols : None, str or list The covariates. If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor cluster variables ``cluster_cols`` + are used as covariates. Default is ``None``. z_cols : None, str or list The instrumental variable(s). + Default is ``None``. cluster_cols : None, str or list + The cluster variable(s). Default is ``None``. - - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD or SSM Estimatiors). - Default is ``None``. - + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. + + is_cluster_data : bool + Flag indicating whether this data object is being used for cluster data. + Default is ``False``. force_all_x_finite : bool or str Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. @@ -135,7 +135,8 @@ class DoubleMLData(DoubleMLBaseData): Default is ``True``. Examples - -------- >>> from doubleml import DoubleMLData + -------- + >>> from doubleml import DoubleMLData >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> # initialization from pandas.DataFrame >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') @@ -152,27 +153,29 @@ def __init__( d_cols, x_cols=None, z_cols=None, - t_col=None, - s_col=None, + cluster_cols=None, use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, + is_cluster_data=False, ): DoubleMLBaseData.__init__(self, data) self.y_col = y_col self.d_cols = d_cols self.z_cols = z_cols - self.t_col = t_col - self.s_col = s_col + self.cluster_cols = cluster_cols self.x_cols = x_cols + self.is_cluster_data = is_cluster_data self._check_disjoint_sets() self.use_other_treat_as_covariate = use_other_treat_as_covariate self.force_all_x_finite = force_all_x_finite self.force_all_d_finite = force_all_d_finite self._binary_treats = self._check_binary_treats() self._binary_outcome = self._check_binary_outcome() - self._set_y_z_t_s() + self._set_y_z() + if self.cluster_cols is not None: + self._set_cluster_vars() # by default, we initialize to the first treatment variable self.set_x_d(self.d_cols[0]) @@ -188,7 +191,7 @@ def __str__(self): + "\n------------------ DataFrame info ------------------\n" + df_info ) - return res + return res def _data_summary_str(self): data_summary = ( @@ -197,10 +200,12 @@ def _data_summary_str(self): f"Covariates: {self.x_cols}\n" f"Instrument variable(s): {self.z_cols}\n" ) - if self.t_col is not None: - data_summary += f"Time variable: {self.t_col}\n" - if self.s_col is not None: - data_summary += f"Score/Selection variable: {self.s_col}\n" + + if self.cluster_cols is not None: + data_summary += f"Cluster variable(s): {self.cluster_cols}\n" + + if hasattr(self, 'is_cluster_data') and self.is_cluster_data: + data_summary += f"Is cluster data: {self.is_cluster_data}\n" data_summary += f"No. 
Observations: {self.n_obs}\n" return data_summary @@ -211,11 +216,11 @@ def from_arrays( y, d, z=None, - t=None, - s=None, + cluster_vars=None, use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, + is_cluster_data=False, ): """ Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. @@ -229,18 +234,12 @@ def from_arrays( Array of the outcome variable. d : :class:`numpy.ndarray` - Array of treatment variables. - - z : None or :class:`numpy.ndarray` + Array of treatment variables. z : None or :class:`numpy.ndarray` Array of instrumental variables. Default is ``None``. - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD and SSM models). + cluster_vars : None or :class:`numpy.ndarray` + Array of cluster variables. Default is ``None``. use_other_treat_as_covariate : bool @@ -300,6 +299,7 @@ def from_arrays( d = _assure_2d_array(d) y_col = "y" + if z is None: check_consistent_length(x, y, d) z_cols = None @@ -312,39 +312,30 @@ def from_arrays( else: z_cols = [f"z{i + 1}" for i in np.arange(z.shape[1])] - if t is None: - t_col = None - else: - t = column_or_1d(t, warn=True) - check_consistent_length(x, y, d, t) - t_col = "t" - - if s is None: - s_col = None + if cluster_vars is None: + cluster_cols = None else: - s = column_or_1d(s, warn=True) - check_consistent_length(x, y, d, s) - s_col = "s" + cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) + cluster_vars = _assure_2d_array(cluster_vars) + check_consistent_length(x, y, d, cluster_vars) + if cluster_vars.shape[1] == 1: + cluster_cols = ["cluster_var"] + else: + cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])] if d.shape[1] == 1: d_cols = ["d"] else: d_cols = [f"d{i + 1}" for i in np.arange(d.shape[1])] - x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] - - # baseline version with features, outcome and treatments + x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] # baseline version with features, outcome and treatments data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols) - if z is not None: df_z = pd.DataFrame(z, columns=z_cols) data = pd.concat([data, df_z], axis=1) - - if t is not None: - data[t_col] = t - - if s is not None: - data[s_col] = s + if cluster_vars is not None: + df_cluster = pd.DataFrame(cluster_vars, columns=cluster_cols) + data = pd.concat([data, df_cluster], axis=1) return cls( data, @@ -352,11 +343,11 @@ def from_arrays( d_cols, x_cols, z_cols, - t_col, - s_col, + cluster_cols, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite, + is_cluster_data, ) @property @@ -397,24 +388,35 @@ def z(self): return None @property - def t(self): + def cluster_cols(self): """ - Array of time variable. + The cluster variable(s). """ - if self.t_col is not None: - return self._t.values - else: - return None + return self._cluster_cols - @property - def s(self): - """ - Array of score or selection variable. - """ - if self.s_col is not None: - return self._s.values + @cluster_cols.setter + def cluster_cols(self, value): + reset_value = hasattr(self, "_cluster_cols") + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError( + "The cluster variable(s) cluster_cols must be of str or list type (or None). " + f"{str(value)} of type {str(type(value))} was passed." 
+ ) + if not len(set(value)) == len(value): + raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.") + if not set(value).issubset(set(self.all_variables)): + raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.") + self._cluster_cols = value else: - return None + self._cluster_cols = None + + if reset_value: + self._check_disjoint_sets() + if self.cluster_cols is not None: + self._set_cluster_vars() @property def n_treat(self): @@ -538,7 +540,7 @@ def y_col(self, value): self._y_col = value if reset_value: self._check_disjoint_sets() - self._set_y_z_t_s() + self._set_y_z() @property def z_cols(self): @@ -567,59 +569,30 @@ def z_cols(self, value): self._z_cols = value else: self._z_cols = None + if reset_value: self._check_disjoint_sets() - self._set_y_z_t_s() + self._set_y_z() @property - def t_col(self): + def n_cluster_vars(self): """ - The time variable. + The number of cluster variables. """ - return self._t_col - - @t_col.setter - def t_col(self, value): - reset_value = hasattr(self, "_t_col") - if value is not None: - if not isinstance(value, str): - raise TypeError( - "The time variable t_col must be of str type (or None). " - f"{str(value)} of type {str(type(value))} was passed." - ) - if value not in self.all_variables: - raise ValueError(f"Invalid time variable t_col. {value} is no data column.") - self._t_col = value + if self.cluster_cols is not None: + return len(self.cluster_cols) else: - self._t_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() + return 0 @property - def s_col(self): + def cluster_vars(self): """ - The score or selection variable. + Array of cluster variable(s). """ - return self._s_col - - @s_col.setter - def s_col(self, value): - reset_value = hasattr(self, "_s_col") - if value is not None: - if not isinstance(value, str): - raise TypeError( - "The score or selection variable s_col must be of str type (or None). " - f"{str(value)} of type {str(type(value))} was passed." - ) - if value not in self.all_variables: - raise ValueError(f"Invalid score or selection variable s_col. 
{value} is no data column.") - self._s_col = value + if self.cluster_cols is not None: + return self._cluster_vars.values else: - self._s_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() + return None @property def use_other_treat_as_covariate(self): @@ -684,7 +657,7 @@ def force_all_d_finite(self, value): # by default, we initialize to the first treatment variable self.set_x_d(self.d_cols[0]) - def _set_y_z_t_s(self): + def _set_y_z(self): def _set_attr(col): if col is None: return None @@ -693,8 +666,12 @@ def _set_attr(col): self._y = _set_attr(self.y_col) self._z = _set_attr(self.z_cols) - self._t = _set_attr(self.t_col) - self._s = _set_attr(self.s_col) + + def _set_cluster_vars(self): + """Set cluster variables.""" + if self.cluster_cols is not None: + assert_all_finite(self.data.loc[:, self.cluster_cols]) + self._cluster_vars = self.data.loc[:, self.cluster_cols] def set_x_d(self, treatment_var): """ @@ -728,40 +705,15 @@ def set_x_d(self, treatment_var): def _get_optional_col_sets(self): # this function can be extended in inherited subclasses z_cols_set = set(self.z_cols or []) - t_col_set = {self.t_col} if self.t_col else set() - s_col_set = {self.s_col} if self.s_col else set() - - return [z_cols_set, t_col_set, s_col_set] - - def _check_binary_treats(self): - is_binary = pd.Series(dtype=bool, index=self.d_cols) - if not self.force_all_d_finite: - is_binary[:] = False # if we allow infinite values, we cannot check for binary - else: - for treatment_var in self.d_cols: - this_d = self.data.loc[:, treatment_var] - binary_treat = type_of_target(this_d) == "binary" - zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) - is_binary[treatment_var] = binary_treat & zero_one_treat - return is_binary - - def _check_binary_outcome(self): - y = self.data.loc[:, self.y_col] - binary_outcome = type_of_target(y) == "binary" - zero_one_outcome = np.all((np.power(y, 2) - y) == 0) - is_binary = binary_outcome & zero_one_outcome - return is_binary - - @staticmethod - def _check_disjoint(set1, set2, name1, arg1, name2, arg2): - """Helper method to check for disjoint sets.""" - if not set1.isdisjoint(set2): - raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).") + cluster_cols_set = set(self.cluster_cols or []) + return [cluster_cols_set, z_cols_set] def _check_disjoint_sets(self): # this function can be extended in inherited subclasses self._check_disjoint_sets_y_d_x() - self._check_disjoint_sets_z_t_s() + self._check_disjoint_sets_z() + if self.cluster_cols is not None: + self._check_disjoint_sets_cluster_cols() def _check_disjoint_sets_y_d_x(self): y_col_set = {self.y_col} @@ -782,14 +734,12 @@ def _check_disjoint_sets_y_d_x(self): "(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``." 
) - def _check_disjoint_sets_z_t_s(self): + def _check_disjoint_sets_z(self): y_col_set = {self.y_col} x_cols_set = set(self.x_cols) d_cols_set = set(self.d_cols) z_cols_set = set(self.z_cols or []) - t_col_set = {self.t_col} if self.t_col else set() - s_col_set = {self.s_col} if self.s_col else set() instrument_checks_args = [ (y_col_set, "outcome variable", "``y_col``"), @@ -801,12 +751,38 @@ def _check_disjoint_sets_z_t_s(self): set1=set1, name1=name, arg1=argument, set2=z_cols_set, name2="instrumental variable", arg2="``z_cols``" ) - time_check_args = instrument_checks_args + [(z_cols_set, "instrumental variable", "``z_cols``")] - for set1, name, argument in time_check_args: - self._check_disjoint(set1=set1, name1=name, arg1=argument, set2=t_col_set, name2="time variable", arg2="``t_col``") - - score_check_args = time_check_args + [(t_col_set, "time variable", "``t_col``")] - for set1, name, argument in score_check_args: + def _check_disjoint_sets_cluster_cols(self): + """Check that cluster columns are disjoint from other variable sets.""" + cluster_cols_set = set(self.cluster_cols) + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + z_cols_set = set(self.z_cols or []) + checks = [ + (y_col_set, "outcome variable", "``y_col``"), + (d_cols_set, "treatment variable", "``d_cols``"), + (x_cols_set, "covariate", "``x_cols``"), + (z_cols_set, "instrumental variable", "``z_cols``"), + ] + for set1, name, arg in checks: self._check_disjoint( - set1=set1, name1=name, arg1=argument, set2=s_col_set, name2="score or selection variable", arg2="``s_col``" + set1=set1, + name1=name, + arg1=arg, + set2=cluster_cols_set, + name2="cluster variable(s)", + arg2="``cluster_cols``", ) + + @property + def is_cluster_data(self): + """ + Flag indicating whether this data object is being used for cluster data. + """ + return self._is_cluster_data + + @is_cluster_data.setter + def is_cluster_data(self, value): + if not isinstance(value, bool): + raise TypeError(f"is_cluster_data must be True or False. 
Got {str(value)}.") + self._is_cluster_data = value diff --git a/doubleml/data/base_data_content.txt b/doubleml/data/base_data_content.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ccdf7ca9a7ea2f43ee36364c7123001ff431e8c GIT binary patch literal 60862 zcmeI5|8EsXlECNZlkR_bXMVtF7boofvdW1Q93*>4$tJoW6p4!*1BS2xlf#ezyZP7e zu0B&Ps=BAUr{DNJFvPN8?DwXt>w8sK*Yy0~|2`g`4nGgihUdeJVKqFK-~M^{r{SUe z{Y<`D4F|)$;r{qdyzlz(AH$8|k0;+B%h=~~=Y6?89S-IHA7%7|5faeSIgW;xr?`MH{Bv1|p?$Q5$K7s1^9S=VZPh`#m!Qg1v6WA|gz9-|DbY37F%bifV zFZVx@pM9b9v5b2xpB-kD*CFo>zl`G_kJvHaGl2p3B9HD17Y>CdKaI0JA35}TbTmAY-+w$A@k~C0Z%6Vi(%K)c1P(ht9>}*ZMP@*FD*s20?8&IZ zaUL)|5a?yh;SRidDf1l4m_Nx(d}cHtEBkj}=Fq!W!_6@FSHqX0BYblxpEK9Nusf*U zTpB(b$FGEE{OpWBcK1p&s|<5BVh#6@*`bVd$@9r0`2>AouFJv$H1a384=p-AnZ@w7 zxgW^)4`tlG{Fo;2`|gPO6ZsA;H>f%02IcVx<4EXND?*+}0s%~p1dBs~XE1<=9`C*7 z%NL_=U^zATHV;;Yra=4c;kCdy96lfAf==uT7CUd}fm<&}99F^$Mm!s_tj)a=Y!%|3 z-23-&Brpx{OT%w(Vc4DVKKm?=&@sOX*4Qu>tu_etJ3=keFxu>z^3rDC6E3<99Lsq4 zurCy1z5gTE!%V9To?RpF?g%E>Q_8)a;g&#i+pf#$Gr{_X;Dfay5oC>)yc8Or$lv$n zQ>=NFV2!6c5UF5;b^4JmdWW?d^{e6P2w!!(rNiorQw^?Zl|kPZeDE#!gy*6II$H^k z{7(MH_wQC@c~9VCU0{!|*&Dtc?}uZLgxg_Op@jz4z_5MAv3tYcWL{z|GtdA0w{UHziQOLr($2trmpgvIQuS~p)Ter5^sYMKTpnkf$IRrP z>t4xs*!)W=a!-uDsQBd7eD_+GMBVj?-->qO@*&Koq3 z?=|W*D=A6m_L$juRhHgI?NbohSaA1(+^weIeO( zZehsx*{EUUHk$pCMUZt99mu`;pA3|hnL{T#*Kb-Y2%sLy5BWQq1%zGs`*@`AzI>uO zPtF3qa9lMn>e&VPRI}kH^6g_8i&bzZmPjlkCnJ|DYdP5*xL%U)jZ4sqhTDvq(I_8* z%AI5+`f$MRTn5{+J)fZ^<m5H6TIdDz*hDQLma^1wDh{Y{< z#R`bkYLCQhtz{mOYs=DZ3LZ#vS3HAP5K61~Jc6kAj-yVqPFFcZef})q{n_xt1V6+4 z2eBXgtYyd8B9=?8r0ZX76pK@@@pyP6+{7lWdeWLTR-;NQ_aBCfA{XqN|9=(88a4Ir zoCdFf?TwxQOGXo33w^+ua^V~KR;`^lir-1CzLcFtdMskDMz5daGf!QFWx7f1z-t-o zDb}w~w3hsVXrmh6f`#tA7&RSho%-YH9MqjEff@EGz z<|ivAZmDMn_Byygi|1O%+VsKrZr59)xhv6_AI3bK8Tp-AH7ZWI2Zwe~xTV!ZKG7V7 zu_j4=C))iwW(QQa+F*N~es z?xASIOTm!rMl(I;GacpksZB*{)u!zJWjc-L?c;Z}GShRr_EKZ_*Z92⁣89z1ajE zC7p<$sa~)jVAdG(DXrqGmbpf1RT3{r^^6QZLNw0Z8l#feF|jFp0kOyI9+_IUJVkSV$fOVV0F--6h4$s_}y#7o>}7UScw>>FJ#V@e1et!B5?5U8fnSt@$`I${=mNo zk%|>Klx3&27q{ZJ9+l{1M3mt)V^>+Wti0|?qeV}1vyvYr*G)ZFj5X%1K3qRq$|XFf zNwe$+qZO4c@!Yc#0r4c~p4H>|wmd5ylRA+5Oy;zjI==0Uuh-Iu?LKOK z3qnHgPUBYT)f^SLE2g5BzvlkBrtUn)F1$ zo7kxpYnN0!Zwjr{Lf3=3Cp>y0aQ9?B9gnTO7-MESpY4!P!+#`iM&Lob%QV+^)^X;z@x zMRZlNmFb)pCD$v>tCd?;OGwC9O~f_g|I_g^!>uMUdXKbY>!M(Bam*?&%4fA%*WspR z%<$8ywIxb+MQ_Qbwbzf_c60pnnQIjF@X?_tW;Jf^1>XA@b{}#IBOiP>H-3sIM0(Rc zyLgxR3@k@`?@Hey0qpaSi6+@tuP2tjk9EQF9kHT&^=+lBd~d9j&E1=>w7vc}H*RU2 z=jz+FVHx*QCYi6um%zxg(K#6Hx=GsI74;$Ci82ky_1QX0%^R_M+Sl#LY;i`L!Sr)a ztlZ?Nw&Z`e$3-pFk@1uE*r;>Q~S0JGun&7iJ!E`wq>u-+V`~KBONN1 z+K0_b@mO{US#fKpwZ-UKkDp6=sTC>>OGme<>#<;+zvs^N=gKX;K39Hcd#-({0U#lw z4i@FvD_JxBmM)i6!u4_U63%DM_pkcrO-J_3ta`XVQu{_CY?>vR-_hEQ(_XVzGF&*t zP8!uF+r3Hq?hUG49WFhB`amalrg*@v)vOa$hza+6B!Y;K3$jOyYQ@2 zsP$Ogh*}0a+*&EqI##Por>#v!3ts7gVD5BApR4^+N>zjxGZ(F(oOf406dU~TyQ?i9 zIPa?V-v6li<`?Yx`{}ASis44&i2DLtgXCT5o9eD^KTkmK_198KZ*|I00p=+lT}j1E zwbgrVlrsZ$=(+w#-RI8dihi|swi^-wyr!#tFFD+pSN6iaB~eiQw)c9}_UrTcVqV>v zZyGg>kK2vdH7#5Xe;BhQqp8nx^uN*HM)y{0}uH0H9pKb|!x&~stPA7rj=O){El3#s{DtS86!amr7%(sg*u|Q2G3g zSb^of<&mkpJ-?kBw{$%>R}1M`r!9ezufOJCwDo)anv7QkP5ySpKTp>Kd53CV)44f% zmxFO9%e0Q}5!Y3Eh1c)1HLT|PZSB6fzFWKRl~^-AWE~7knr`r}i%-ywVRm<^td@FB zrn|>Bo)y}5@v*gBx)z?spE{20@NXN(`3#4>6XTj)-<)7(zQuOfwRtJ(6}~mbljwK~ou+^{Rc9U>a^c646T5htzLq)cqd1V~kes!}$374~=Gg z^Y5b%(I{45;QsMr?MF0v(`Zo223#>J=>mnlYNXv@rT>bXcU~*KeqZCZH~OUV4vG{JH3wm(6`(3nBy_+JJVi( zO>`=Hy)y5t%dvR>9d?FMy{4wS4%G0-2w2zH6BwKk$FocA=dh^L>uRXhV`Vt#Og%7H z!@rDs5<7y6^+4NGH#<`2X|20#wz#UgGs4NwaO;6$(5C0T_0iEjv&x{`m0i;wxa(l9 
zJF(QI*2iV13mbi+6CJSA6KwU3Zum1wwstN*nZzk~J#>|ON!Ynm{W=;~F51U z^u(`ta&Wp=7@vUJAM4xs_a)C5pKZL_TRhosa ztvAxU_gJtcTf8D&S37c5&xp2b7d8FeoZ^6;v%cfp;$0azrTtRjj6TCaIhO9GdLPWX zwiUal=+@G%u}#@R9m|%5)JC1&*SGDaHgB&r)yB`w>d(8Z@896bvU%HjG}XytH z)h%o@zeBr8H*2r?961))ZhnvAvDBW+(CG5WJ>0yNbMAgVs#gx=oH@0;ZJ@^AK0jE3 z3J)JF_J1WBTbzkK#cG*%#nbK#-z~=ARe{T3U>?`57vvK=2bW3*&)2h0q^CRD^W1)x z$wT>uo?6XJ>7Atdl6HYxU46O(l|DglI9NBTQeJyE1g8oSX;l7ta-ep)>pM1fCGO%B3vsTfDN0k>y|Q`l6o5Zkbk{3z}=? zG{qR+_RS@31C0}LknFB-!|p-4hS&F_tPg~HWU5ysBPh?)_Q9v&P3nc_v?aaUvk8qZ zzHWh*{9ffQ=Sh_0Y?@$PikQrEGM%f&)zZB53ECM48XS zeP^_=_MK60+jpjXGi$ANZyRk(JLKB9lB4ljYe}pQFZH z6Q9c27VS3{yb?XLryy#T>-tX6b(w+H0ab%~pYXpGHTkBD;u%~{r@1e?H>=^R6G#ul z8o*jl(!zgvkInrtK4CAcru76h@b7(=vy97a!G1NaaDYRUx>D-h_QlGq-8)P}xSN_DYX}*(nvWvuCM2 z)PnhkVY@L(Z`XENC?8(H@w$WgKJMI)cu&|wubGV)VVwl zS(e$>JtH-=oV%9Q*tKtco6cdYk!8}#^9{?CPb0(I8jFu^9YE)CnAgiEmpPm*^gx4pc()`TkaDK5KaXHs404j7YUCC%P)|E%=N!Oi1 zj%YZv-+t&gEz@kZ6|z*bNbmn~*ihfsUjHJo`=+wbw54`=VzmzlBrzPDGCHs342heiI^|erG-d>sz3peYx{C$Nqd=Q^5IM z;oU94Kq2dS!>KQ-dC9sqd?py*7_%@s^3*n|@=V_w>husPYu>_WNjay!D&?3yKhLhV zK2Z)w?LMyh#!+3NSiQz`ZhZd|JA7As#WJpsD@^o(XVqCHlI`}`aP+bM#6ME>O4Ym! zoQw77re4!!Te6wHyrO?L^B;L+O?9Unz}fg;^J^qdy8I`2(s?^RyK>n zr-n@qF_joH;sh!?=PF97uWs}4yxq)vtn)3dbC%Y%)_tQ{JZ(hWP$p)$I z@xDN<_@F!boTk1#g#3;Zbm=4~pJa6vC2pEi$Dx+#rdp8OV813=9v4qjsd?l&mMu>C z48C?d3#&9Z=(wo|ooZxU4VJTvKEGO;?bD(zq!LfZEt7NW>_W4>^=1C8=reDFwdcp} z%?P~r;*or=bxB>{5eefn=loTB^9+%-r+Q?F;lBAv zqK9Sz;hT6_H z6-rO#;Jl_``AWRTn#NSCSBW-G`^k0h@VsgfIjX;>YwwtuoSmx0oV?HL6195I9OvdL z5Bd1$`r>5}eTlrUF#WN^$*oyfD8Hb)&>G-ngFu?s@O**IB!Do^7MQjPr(^tYJh97}KG zr_u$xv|bb;d4{|G%F1$kU;Eik^>m%YJe%D0j5MHcQRl8DyqeGWBP{2~_A~xCZ#(1P zT(0g`XTHKeDepU`Sy>Mzz59)E->CF-c$BGGN`CcRyk85+7TlJWX6XHUh-YQ*upFmT z`ED!gjBZw}JT7hXnwUJFmy%;yy)`nR7|_^j=m z!gKmf^)^KxwKSd+D=n{_lQ?gNv^YK&#jP6%-*y)Icv`X8J)G~$>&naRx87gUUQIsM-==4hFkz4|QCO3mAyrC1QDFyD*z`Wwn3 z1${eaai?|@9u^(by0q_Gqi0iJyXnW}C9 zH)gx}HTtckvBh9|gjrYPIW@x~4pp-Aj#T66>ZBUwf2Y5tYp89lGHuSNEq6Lso2S!- zO|LnPl9X>6Io|c!({P#RYK&nvs&<`b8K;v#d)3+xJNo2Sw$ejuImIzjh->(L?D-R&W_e#Mv01xSjd#metmQ7s_$19Vvs%9^Mx|ufvXc{Mh|$Wk z@cvvQ?YW75Tl?H=vz0Z!PixaTwP!F|TVHmZW3~y&UKicK3&v-_Be$pWT$U-nOPRXn z)UDwxKEt>4OaxyOtZ&)3WQFwmd5=)N`#H7Hs6k7mlEf~1HNlU4juCQ*4|K$V+d*08_&nEx- z%cw2&3Sa%%qfO}Wc56}>gudT%*Fw)}HEyBX@$Dbp8_AaV@E%T2{dLc%dEeXv#ihJ` zKiq>)+voOlO!W0xYgS0K)7|qA^>lZ7o8k#kJd3TT)kWI$bbq|MeOo?7eF`nF%2~`n zTRdJ^1bLtDTX1^8ZBr34F${gB51v_EaZMyVv#Y_)J!Q+SYV8OHa@uJQ*?gd4zjee#h>? 
z!dGLQzXDgZ6TGCM-^d;8-!eu|yDY!UA%9yif0y7*u`oXC``ERvR~l(rb^V01ysM&* zJCXmK`JyLb>*-6X6WqZW-|L^fq!!%XC92bm=Ji!FV<*wlZa(&v>bOjIj?!}%%KhMV zc>U)*wqPtj_puyVR9^KN>;2ArV`fdI<9GS1>rJz@90?_-T%>P30JeEA| z6ZzrnD=6ngOns94seF1X)=bDj#$c(aKaZ#wUaytntpm>%*TQ_d>88f4W&0bF8bT7DyXSsFc@LWg2~`(1D< zmH$!><(MzwklPDq5#*X!%W3OYQGUKW&hlONEIF63g=i0!C9ijuh~ZMplJSFhI$cFS z6knBQ#`tRTRyzyxmE6ranPB4{9S_bq7Mvx>_k5%039||?dn;$Aab;+$QkRUKMXbG< z@m|~Qyjxx0)K(yKv((1T&zP>UEpI+5d%-18{h2je#n?6XsWpkl9{c`YW8tm`rL+J<$ylExe3$>M3G(~5=e`Z~Sr0ekLEHqKrF{+@~jIkc{6xG&Pq z-*aml!_#Lo+10okWoP%Ep1URan{4dL%J3{mV49PZWau)995b1;=q=mV1(9=WmFO$N zG@J2wu5mnSi0<0CE=NpFtF*fHqiCIG7O_qUM!x#=6{o!;RV`SvnD$Du#1AM=v9*(j zQ#{qG()H(AT4APsEaGnVQ?s>5z1{n6Y;C9U-UjzRx*q@Jq(Y$YpS5VLF~q6^eKxZz z9*ce#_uu@@n`TY&D(k5aEbk$h=dkOJWG+CAeAOAKBi#@kr}E4mEcx_G{Is5CLB-`~ z!6B{ou0~i(YeLM6cP?qZ;X93u>*k_*CV*LZO?Rf=|KrOhO1ZxK#x&cB_fA(HC5hqP zv|8<_9mLF<_mZMNkH!pu>cZWo_cShZGxuR|Ky_Da9YWlXD z*j_qsj|IoMb1UwdJ=$eDd&%OJA5a#@wJ9xE4YVqxHN6+|7H1&q8&(g;-&JxhS&NP9 zNWER{F$J3>Nq7jJarT3B=}U zj-=Y6bLv=R7iUpfmh8u{O57?VvrnHuByEX3j+7YtE$uGv;3079vl|Kj1U$u7| z=}w8wPJFyK{7s@L{Y19zq~c~UbZti6!ZN&+5~45B`rxF?$F#(*^**L_w;|e6n`3Yy za^srd+T!qh2Y!3JO9&sPJ;v;b*W{JfDu8WXj}`cR85z$D0_LWkGo!lKre)fNb6YZ{ QZ9l&~hZ8Z|`HF1+ANk8VaR2}S literal 0 HcmV?d00001 diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py index 89947b73..290c61f5 100644 --- a/doubleml/data/cluster_data.py +++ b/doubleml/data/cluster_data.py @@ -84,13 +84,11 @@ def __init__( use_other_treat_as_covariate=True, force_all_x_finite=True, ): - DoubleMLBaseData.__init__(self, data) - - # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter + DoubleMLBaseData.__init__(self, data) # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter self.cluster_cols = cluster_cols self._set_cluster_vars() DoubleMLData.__init__( - self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite + self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite, is_cluster_data=True ) self._check_disjoint_sets_cluster_cols() @@ -176,7 +174,7 @@ def from_arrays( >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) """ - dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite) + dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite, is_cluster_data=True) cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) cluster_vars = _assure_2d_array(cluster_vars) if cluster_vars.shape[1] == 1: diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py new file mode 100644 index 00000000..150aeb7d --- /dev/null +++ b/doubleml/data/did_data.py @@ -0,0 +1,272 @@ +import io +import numpy as np +import pandas as pd +from sklearn.utils.validation import check_array + +from doubleml.data.base_data import DoubleMLData +from doubleml.utils._estimation import _assure_2d_array + + +class DoubleMLDIDData(DoubleMLData): + """Double machine learning data-backend for Difference-in-Differences models. + + :class:`DoubleMLDIDData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. 
+ + d_cols : str or list + The treatment variable(s). + + t_col : str + The time variable for DiD models. + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor time variable ``t_col`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + cluster_cols : None, str or list + The cluster variable(s). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. + + Examples + -------- >>> from doubleml import DoubleMLDIDData + >>> from doubleml.did.datasets import make_did_SZ2020 + >>> # initialization from pandas.DataFrame + >>> df = make_did_SZ2020(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLDIDData(df, 'y', 'd', 't') + >>> # initialization from np.ndarray + >>> (x, y, d, t) = make_did_SZ2020(return_type='array') + >>> obj_dml_data_from_array = DoubleMLDIDData.from_arrays(x, y, d, t=t) + """ + + def __init__( + self, + data, + y_col, + d_cols, + t_col, + x_cols=None, + z_cols=None, + cluster_cols=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + # Set time column before calling parent constructor + self.t_col = t_col + + # Call parent constructor + super().__init__( + data=data, + y_col=y_col, + d_cols=d_cols, + x_cols=x_cols, + z_cols=z_cols, + cluster_cols=cluster_cols, + use_other_treat_as_covariate=use_other_treat_as_covariate, + force_all_x_finite=force_all_x_finite, + force_all_d_finite=force_all_d_finite, + ) + + # Set time variable array after data is loaded + self._set_time_var() + + @classmethod + def from_arrays( + cls, + x, + y, + d, + t, + z=None, + cluster_vars=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + """ + Initialize :class:`DoubleMLDIDData` object from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + t : :class:`numpy.ndarray` + Array of the time variable for DiD models. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + cluster_vars : None or :class:`numpy.ndarray` + Array of cluster variables. + Default is ``None``. 
+ + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. + + Examples + -------- >>> from doubleml import DoubleMLDIDData + >>> from doubleml.did.datasets import make_did_SZ2020 + >>> (x, y, d, t) = make_did_SZ2020(return_type='array') + >>> obj_dml_data_from_array = DoubleMLDIDData.from_arrays(x, y, d, t=t) + """ + # Prepare time variable + t = check_array(t, ensure_2d=False, allow_nd=False) + t = _assure_2d_array(t) + if t.shape[1] != 1: + raise ValueError("t must be a single column.") + t_col = "t" + + # Create base data using parent class method + base_data = DoubleMLData.from_arrays( + x, y, d, z, cluster_vars, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite + ) + + # Add time variable to the DataFrame + data = pd.concat((base_data.data, pd.DataFrame(t, columns=[t_col])), axis=1) + + return cls( + data, + base_data.y_col, + base_data.d_cols, + t_col, + base_data.x_cols, + base_data.z_cols, + base_data.cluster_cols, + base_data.use_other_treat_as_covariate, + base_data.force_all_x_finite, + base_data.force_all_d_finite, + ) + + @property + def t_col(self): + """ + The time variable. + """ + return self._t_col + + @t_col.setter + def t_col(self, value): + if not isinstance(value, str): + raise TypeError( + "The time variable t_col must be of str type. " + f"{str(value)} of type {str(type(value))} was passed." + ) + # Check if data exists (during initialization it might not) + if hasattr(self, '_data') and value not in self.all_variables: + raise ValueError("Invalid time variable t_col. The time variable is no data column.") + self._t_col = value + # Update time variable array if data is already loaded + if hasattr(self, '_data'): + self._set_time_var() + + @property + def t(self): + """ + Array of time variable. 
+ """ + return self._t.values + + def _get_optional_col_sets(self): + """Get optional column sets including time column.""" + base_optional_col_sets = super()._get_optional_col_sets() + t_col_set = {self.t_col} + return [t_col_set] + base_optional_col_sets + + def _check_disjoint_sets(self): + """Check that time column doesn't overlap with other variables.""" + # Apply standard checks from parent class + super()._check_disjoint_sets() + self._check_disjoint_sets_t_col() + + def _check_disjoint_sets_t_col(self): + """Check that time column is disjoint from other variable sets.""" + t_col_set = {self.t_col} + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + z_cols_set = set(self.z_cols or []) + cluster_cols_set = set(self.cluster_cols or []) + + t_checks_args = [ + (y_col_set, "outcome variable", "``y_col``"), + (d_cols_set, "treatment variable", "``d_cols``"), + (x_cols_set, "covariate", "``x_cols``"), + (z_cols_set, "instrumental variable", "``z_cols``"), + (cluster_cols_set, "cluster variable(s)", "``cluster_cols``"), + ] + for set1, name, argument in t_checks_args: + self._check_disjoint( + set1=set1, + name1=name, + arg1=argument, + set2=t_col_set, + name2="time variable", + arg2="``t_col``", + ) + + def _set_time_var(self): + """Set the time variable array.""" + if hasattr(self, '_data') and self.t_col in self.data.columns: + self._t = self.data.loc[:, [self.t_col]] + + def __str__(self): + """String representation.""" + data_summary = self._data_summary_str() + buf = io.StringIO() + print("================== DoubleMLDIDData Object ==================", file=buf) + print(f"Time variable: {self.t_col}", file=buf) + print(data_summary, file=buf) + return buf.getvalue() diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index f548ae6a..f34b2ee1 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -83,15 +83,15 @@ def __init__( x_cols=None, z_cols=None, use_other_treat_as_covariate=True, - force_all_x_finite=True, - datetime_unit="M", + force_all_x_finite=True, datetime_unit="M", ): DoubleMLBaseData.__init__(self, data) # we need to set id_col (needs _data) before call to the super __init__ because of the x_cols setter self.id_col = id_col self._datetime_unit = _is_valid_datetime_unit(datetime_unit) - self._set_id_var() + self._set_id_var() # Set t_col first before calling parent constructor + self.t_col = t_col DoubleMLData.__init__( self, @@ -100,8 +100,6 @@ def __init__( d_cols=d_cols, x_cols=x_cols, z_cols=z_cols, - t_col=t_col, - s_col=None, use_other_treat_as_covariate=use_other_treat_as_covariate, force_all_x_finite=force_all_x_finite, force_all_d_finite=False, @@ -110,6 +108,7 @@ def __init__( raise ValueError("Only one treatment column is allowed for panel data.") self._check_disjoint_sets_id_col() + self._set_t() # intialize the unique values of g and t self._g_values = np.sort(np.unique(self.d)) # unique values of g @@ -217,9 +216,7 @@ def n_obs(self): """ The number of observations. For panel data, the number of unique values for id_col. """ - return len(self._id_var_unique) - - @property + return len(self._id_var_unique) @property def g_col(self): """ The treatment variable indicating the time of treatment exposure. @@ -235,8 +232,7 @@ def d_cols(self, value): @property def g_values(self): """ - The unique values of the treatment variable (groups) ``d``. - """ + The unique values of the treatment variable (groups) ``d``. 
""" return self._g_values @property @@ -246,13 +242,36 @@ def n_groups(self): """ return len(self.g_values) - @DoubleMLData.t_col.setter + @property + def t_col(self): + """ + The time variable. + """ + return self._t_col + + @t_col.setter def t_col(self, value): if value is None: raise TypeError("Invalid time variable t_col. Time variable required for panel data.") - super(self.__class__, self.__class__).t_col.__set__(self, value) - if hasattr(self, "_t_values"): - self._t_values = np.sort(np.unique(self.t)) # update unique values of t + reset_value = hasattr(self, "_t_col") + if not isinstance(value, str): + raise TypeError( + f"The time variable t_col must be of str type. {str(value)} of type {str(type(value))} was passed." + ) + if value not in self.all_variables: + raise ValueError(f"Invalid time variable t_col. {value} is no data column.") + self._t_col = value + if reset_value: + self._check_disjoint_sets() + self._set_t() + if hasattr(self, "_t_values"): + self._t_values = np.sort(np.unique(self.t)) # update unique values of t + + def _set_t(self): + """Set time variable.""" + if self.t_col is not None: + assert_all_finite(self.data.loc[:, self.t_col]) + self._t = self.data.loc[:, self.t_col] @property def t_values(self): @@ -271,7 +290,8 @@ def n_t_periods(self): def _get_optional_col_sets(self): base_optional_col_sets = super()._get_optional_col_sets() id_col_set = {self.id_col} - return [id_col_set] + base_optional_col_sets + t_col_set = {self.t_col} # t_col is not None for panel data + return [id_col_set, t_col_set] + base_optional_col_sets def _check_disjoint_sets(self): # apply the standard checks from the DoubleMLData class diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py new file mode 100644 index 00000000..3798dd7e --- /dev/null +++ b/doubleml/data/rdd_data.py @@ -0,0 +1,272 @@ +import io +import numpy as np +import pandas as pd +from sklearn.utils.validation import check_array + +from doubleml.data.base_data import DoubleMLData +from doubleml.utils._estimation import _assure_2d_array + + +class DoubleMLRDDData(DoubleMLData): + """Double machine learning data-backend for Regression Discontinuity Design models. + + :class:`DoubleMLRDDData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + s_col : str + The score/running variable for RDD models. + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor score variable ``s_col`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + cluster_cols : None, str or list + The cluster variable(s). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. 
+ Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. + + Examples + -------- >>> from doubleml import DoubleMLRDDData + >>> from doubleml.rdd.datasets import make_rdd_data + >>> # initialization from pandas.DataFrame + >>> df = make_rdd_data(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLRDDData(df, 'y', 'd', 's') + >>> # initialization from np.ndarray + >>> (x, y, d, s) = make_rdd_data(return_type='array') + >>> obj_dml_data_from_array = DoubleMLRDDData.from_arrays(x, y, d, s=s) + """ + + def __init__( + self, + data, + y_col, + d_cols, + s_col, + x_cols=None, + z_cols=None, + cluster_cols=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + # Set score column before calling parent constructor + self.s_col = s_col + + # Call parent constructor + super().__init__( + data=data, + y_col=y_col, + d_cols=d_cols, + x_cols=x_cols, + z_cols=z_cols, + cluster_cols=cluster_cols, + use_other_treat_as_covariate=use_other_treat_as_covariate, + force_all_x_finite=force_all_x_finite, + force_all_d_finite=force_all_d_finite, + ) + + # Set score variable array after data is loaded + self._set_score_var() + + @classmethod + def from_arrays( + cls, + x, + y, + d, + s, + z=None, + cluster_vars=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + """ + Initialize :class:`DoubleMLRDDData` object from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + s : :class:`numpy.ndarray` + Array of the score/running variable for RDD models. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + cluster_vars : None or :class:`numpy.ndarray` + Array of cluster variables. + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. 
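+
+        Notes
+        -----
+        The array ``s`` is appended to the assembled :class:`pandas.DataFrame` under the
+        column name ``'s'`` and set as the score/running variable of the returned object.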
+ + Examples + -------- >>> from doubleml import DoubleMLRDDData + >>> from doubleml.rdd.datasets import make_rdd_data + >>> (x, y, d, s) = make_rdd_data(return_type='array') + >>> obj_dml_data_from_array = DoubleMLRDDData.from_arrays(x, y, d, s=s) + """ + # Prepare score variable + s = check_array(s, ensure_2d=False, allow_nd=False) + s = _assure_2d_array(s) + if s.shape[1] != 1: + raise ValueError("s must be a single column.") + s_col = "s" + + # Create base data using parent class method + base_data = DoubleMLData.from_arrays( + x, y, d, z, cluster_vars, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite + ) + + # Add score variable to the DataFrame + data = pd.concat((base_data.data, pd.DataFrame(s, columns=[s_col])), axis=1) + + return cls( + data, + base_data.y_col, + base_data.d_cols, + s_col, + base_data.x_cols, + base_data.z_cols, + base_data.cluster_cols, + base_data.use_other_treat_as_covariate, + base_data.force_all_x_finite, + base_data.force_all_d_finite, + ) + + @property + def s_col(self): + """ + The score/running variable. + """ + return self._s_col + + @s_col.setter + def s_col(self, value): + if not isinstance(value, str): + raise TypeError( + "The score variable s_col must be of str type. " + f"{str(value)} of type {str(type(value))} was passed." + ) + # Check if data exists (during initialization it might not) + if hasattr(self, '_data') and value not in self.all_variables: + raise ValueError("Invalid score variable s_col. The score variable is no data column.") + self._s_col = value + # Update score variable array if data is already loaded + if hasattr(self, '_data'): + self._set_score_var() + + @property + def s(self): + """ + Array of score/running variable. + """ + return self._s.values + + def _get_optional_col_sets(self): + """Get optional column sets including score column.""" + base_optional_col_sets = super()._get_optional_col_sets() + s_col_set = {self.s_col} + return [s_col_set] + base_optional_col_sets + + def _check_disjoint_sets(self): + """Check that score column doesn't overlap with other variables.""" + # Apply standard checks from parent class + super()._check_disjoint_sets() + self._check_disjoint_sets_s_col() + + def _check_disjoint_sets_s_col(self): + """Check that score column is disjoint from other variable sets.""" + s_col_set = {self.s_col} + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + z_cols_set = set(self.z_cols or []) + cluster_cols_set = set(self.cluster_cols or []) + + s_checks_args = [ + (y_col_set, "outcome variable", "``y_col``"), + (d_cols_set, "treatment variable", "``d_cols``"), + (x_cols_set, "covariate", "``x_cols``"), + (z_cols_set, "instrumental variable", "``z_cols``"), + (cluster_cols_set, "cluster variable(s)", "``cluster_cols``"), + ] + for set1, name, argument in s_checks_args: + self._check_disjoint( + set1=set1, + name1=name, + arg1=argument, + set2=s_col_set, + name2="score variable", + arg2="``s_col``", + ) + + def _set_score_var(self): + """Set the score variable array.""" + if hasattr(self, '_data') and self.s_col in self.data.columns: + self._s = self.data.loc[:, [self.s_col]] + + def __str__(self): + """String representation.""" + data_summary = self._data_summary_str() + buf = io.StringIO() + print("================== DoubleMLRDDData Object ==================", file=buf) + print(f"Score variable: {self.s_col}", file=buf) + print(data_summary, file=buf) + return buf.getvalue() diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py new file 
mode 100644 index 00000000..d8f3988e --- /dev/null +++ b/doubleml/data/ssm_data.py @@ -0,0 +1,274 @@ +import io +import numpy as np +import pandas as pd +from sklearn.utils.validation import check_array + +from doubleml.data.base_data import DoubleMLData +from doubleml.utils._estimation import _assure_2d_array + + +class DoubleMLSSMData(DoubleMLData): + """Double machine learning data-backend for Sample Selection Models. + + :class:`DoubleMLSSMData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + s_col : str + The selection variable for SSM models. + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor selection variable ``s_col`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + cluster_cols : None, str or list + The cluster variable(s). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. 
+ + Examples + -------- + >>> from doubleml import DoubleMLSSMData + >>> from doubleml.irm.datasets import make_ssm_data + >>> # initialization from pandas.DataFrame + >>> df = make_ssm_data(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLSSMData(df, 'y', 'd', 's') + >>> # initialization from np.ndarray + >>> (x, y, d, s) = make_ssm_data(return_type='array') + >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) + """ + + def __init__( + self, + data, + y_col, + d_cols, + s_col, + x_cols=None, + z_cols=None, + cluster_cols=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + # Set selection column before calling parent constructor + self.s_col = s_col + + # Call parent constructor + super().__init__( + data=data, + y_col=y_col, + d_cols=d_cols, + x_cols=x_cols, + z_cols=z_cols, + cluster_cols=cluster_cols, + use_other_treat_as_covariate=use_other_treat_as_covariate, + force_all_x_finite=force_all_x_finite, + force_all_d_finite=force_all_d_finite, + ) + + # Set selection variable array after data is loaded + self._set_selection_var() + + @classmethod + def from_arrays( + cls, + x, + y, + d, + s, + z=None, + cluster_vars=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + force_all_d_finite=True, + ): + """ + Initialize :class:`DoubleMLSSMData` object from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + s : :class:`numpy.ndarray` + Array of the selection variable for SSM models. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + cluster_vars : None or :class:`numpy.ndarray` + Array of cluster variables. + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + force_all_d_finite : bool + Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. + Default is ``True``. 
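+
+        Notes
+        -----
+        The array ``s`` is appended to the assembled :class:`pandas.DataFrame` under the
+        column name ``'s'`` and set as the selection variable of the returned object.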
+ + Examples + -------- + >>> from doubleml import DoubleMLSSMData + >>> from doubleml.irm.datasets import make_ssm_data + >>> (x, y, d, s) = make_ssm_data(return_type='array') + >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) + """ + # Prepare selection variable + s = check_array(s, ensure_2d=False, allow_nd=False) + s = _assure_2d_array(s) + if s.shape[1] != 1: + raise ValueError("s must be a single column.") + s_col = "s" + + # Create base data using parent class method + base_data = DoubleMLData.from_arrays( + x, y, d, z, cluster_vars, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite + ) + + # Add selection variable to the DataFrame + data = pd.concat((base_data.data, pd.DataFrame(s, columns=[s_col])), axis=1) + + return cls( + data, + base_data.y_col, + base_data.d_cols, + s_col, + base_data.x_cols, + base_data.z_cols, + base_data.cluster_cols, + base_data.use_other_treat_as_covariate, + base_data.force_all_x_finite, + base_data.force_all_d_finite, + ) + + @property + def s_col(self): + """ + The selection variable. + """ + return self._s_col + + @s_col.setter + def s_col(self, value): + if not isinstance(value, str): + raise TypeError( + "The selection variable s_col must be of str type. " + f"{str(value)} of type {str(type(value))} was passed." + ) + # Check if data exists (during initialization it might not) + if hasattr(self, '_data') and value not in self.all_variables: + raise ValueError("Invalid selection variable s_col. The selection variable is no data column.") + self._s_col = value + # Update selection variable array if data is already loaded + if hasattr(self, '_data'): + self._set_selection_var() + + @property + def s(self): + """ + Array of selection variable. + """ + return self._s.values + + def _get_optional_col_sets(self): + """Get optional column sets including selection column.""" + base_optional_col_sets = super()._get_optional_col_sets() + s_col_set = {self.s_col} + return [s_col_set] + base_optional_col_sets + + def _check_disjoint_sets(self): + """Check that selection column doesn't overlap with other variables.""" + # Apply standard checks from parent class + super()._check_disjoint_sets() + self._check_disjoint_sets_s_col() + + def _check_disjoint_sets_s_col(self): + """Check that selection column is disjoint from other variable sets.""" + s_col_set = {self.s_col} + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + z_cols_set = set(self.z_cols or []) + cluster_cols_set = set(self.cluster_cols or []) + + s_checks_args = [ + (y_col_set, "outcome variable", "``y_col``"), + (d_cols_set, "treatment variable", "``d_cols``"), + (x_cols_set, "covariate", "``x_cols``"), + (z_cols_set, "instrumental variable", "``z_cols``"), + (cluster_cols_set, "cluster variable(s)", "``cluster_cols``"), + ] + for set1, name, argument in s_checks_args: + self._check_disjoint( + set1=set1, + name1=name, + arg1=argument, + set2=s_col_set, + name2="selection variable", + arg2="``s_col``", + ) + + def _set_selection_var(self): + """Set the selection variable array.""" + if hasattr(self, '_data') and self.s_col in self.data.columns: + self._s = self.data.loc[:, [self.s_col]] + + def __str__(self): + """String representation.""" + data_summary = self._data_summary_str() + buf = io.StringIO() + print("================== DoubleMLSSMData Object ==================", file=buf) + print(f"Selection variable: {self.s_col}", file=buf) + print(data_summary, file=buf) + return buf.getvalue() diff --git 
a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index b02a3275..09a45ccd 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -2,20 +2,22 @@ import pandas as pd import pytest -from doubleml import DoubleMLClusterData +from doubleml import DoubleMLData from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 @pytest.mark.ci def test_obj_vs_from_arrays(): np.random.seed(3141) - dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) - dml_data_from_array = DoubleMLClusterData.from_arrays( + (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") + dml_data = DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) + dml_data_from_array = DoubleMLData.from_arrays( dml_data.data[dml_data.x_cols], dml_data.data[dml_data.y_col], dml_data.data[dml_data.d_cols], - dml_data.data[dml_data.cluster_cols], - dml_data.data[dml_data.z_cols], + z=dml_data.data[dml_data.z_cols], + cluster_vars=dml_data.data[dml_data.cluster_cols], + is_cluster_data=True ) df = dml_data.data.copy() df.rename( @@ -24,12 +26,13 @@ def test_obj_vs_from_arrays(): assert dml_data_from_array.data.equals(df) # with a single cluster variable - dml_data_from_array = DoubleMLClusterData.from_arrays( + dml_data_from_array = DoubleMLData.from_arrays( dml_data.data[dml_data.x_cols], dml_data.data[dml_data.y_col], dml_data.data[dml_data.d_cols], - dml_data.data[dml_data.cluster_cols[1]], - dml_data.data[dml_data.z_cols], + z=dml_data.data[dml_data.z_cols], + cluster_vars=dml_data.data[dml_data.cluster_cols[1]], + is_cluster_data=True ) df = dml_data.data.copy().drop(columns="cluster_var_i") df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True) @@ -39,7 +42,7 @@ def test_obj_vs_from_arrays(): @pytest.mark.ci def test_x_cols_setter_defaults_w_cluster(): df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1") + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) assert dml_data.x_cols == ["xx1", "xx2", "xx3"] dml_data.x_cols = ["xx1", "xx3"] assert dml_data.x_cols == ["xx1", "xx3"] @@ -48,48 +51,53 @@ def test_x_cols_setter_defaults_w_cluster(): # with instrument df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z") + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z", is_cluster_data=True) assert dml_data.x_cols == ["xx1", "xx2"] # without instrument and with time df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt") - assert dml_data.x_cols == ["xx1", "xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "tt"] # with instrument and with time df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt") - assert dml_data.x_cols == ["xx1", 
"xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "tt"] # without instrument and with selection df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "ss"] # with instrument and with selection df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "ss"] # without instrument with time with selection df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "tt", "ss"] # with instrument with time with selection df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"]) - dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) + assert dml_data.x_cols == ["xx1", "xx2", "tt", "ss"] @pytest.mark.ci def test_cluster_cols_setter(): np.random.seed(3141) - dml_data = make_plr_CCDDHNR2018(n_obs=100) - df = dml_data.data.copy().iloc[:, :10] - df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"] - dml_data = DoubleMLClusterData( - df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)] + (x, y, d) = make_plr_CCDDHNR2018(n_obs=100, return_type="array") + # Create a pandas DataFrame with X, y, and d columns + df = pd.DataFrame(np.column_stack((x[:, :7], y, d)), + columns=[f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"]) + + dml_data = DoubleMLData( + df, "y", ["d1", "d2"], + x_cols=[f"X{i + 1}" for i in np.arange(5)], + cluster_cols=[f"X{i + 1}" for i in [5, 6]], + is_cluster_data=True ) cluster_vars = df[["X6", "X7"]].values @@ -129,56 +137,49 @@ def test_disjoint_sets(): r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy", is_cluster_data=True) msg = ( r"At least one variable/column is set as treatment variable \(``d_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." 
) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1", is_cluster_data=True) msg = ( r"At least one variable/column is set as covariate \(``x_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2", is_cluster_data=True) msg = ( r"At least one variable/column is set as instrumental variable \(``z_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2") - - msg = ( - r"At least one variable/column is set as time variable \(``t_col``\) " - r"and cluster variable\(s\) \(``cluster_cols``\)." - ) - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") - - msg = ( - r"At least one variable/column is set as score or selection variable \(``s_col``\) " - r"and cluster variable\(s\) \(``cluster_cols``\)." - ) - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2", is_cluster_data=True) @pytest.mark.ci def test_duplicates(): np.random.seed(3141) - dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) + (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") + df = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=[f"X{i+1}" for i in range(x.shape[1])] + ["Y", "D", "Z"]) + cluster_df = pd.DataFrame(cluster_vars, columns=["cluster_var_i", "cluster_var_j"]) + data = pd.concat([df, cluster_df], axis=1) msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values." with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"]) + _ = DoubleMLData(data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2", "X3"], is_cluster_data=True) + + dml_data = DoubleMLData(data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2"], is_cluster_data=True) with pytest.raises(ValueError, match=msg): - dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] + dml_data.cluster_cols = ["X3", "X2", "X3"] msg = "Invalid pd.DataFrame: Contains duplicate column names." 
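+    # the DataFrame below lists the column name "y" twice, which has to be rejected at construction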
with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData( - pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"] + _ = DoubleMLData( + pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), + y_col="y", d_cols=["d"], cluster_cols=["X2"], is_cluster_data=True ) @@ -186,45 +187,29 @@ def test_duplicates(): def test_dml_datatype(): data_array = np.zeros((100, 10)) with pytest.raises(TypeError): - _ = DoubleMLClusterData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"]) + _ = DoubleMLData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"], is_cluster_data=True) @pytest.mark.ci def test_cluster_data_str(): np.random.seed(3141) - dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) + (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") + dml_data = DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) # Convert the object to string dml_str = str(dml_data) # Check that all important sections are present in the string - assert "================== DoubleMLClusterData Object ==================" in dml_str + assert "================== DoubleMLData Object ==================" in dml_str assert "------------------ Data summary ------------------" in dml_str assert "------------------ DataFrame info ------------------" in dml_str # Check that specific data attributes are correctly included - assert "Outcome variable: Y" in dml_str - assert "Treatment variable(s): ['D']" in dml_str - assert "Cluster variable(s): ['cluster_var_i', 'cluster_var_j']" in dml_str + assert "Outcome variable: y" in dml_str + assert "Treatment variable(s): ['d']" in dml_str + assert "Cluster variable(s): ['cluster_var1', 'cluster_var2']" in dml_str assert "Covariates: " in dml_str - assert "Instrument variable(s): ['Z']" in dml_str - assert "No. Observations:" in dml_str - - # Test with additional optional attributes - df = dml_data.data.copy() - df["time_var"] = 1 - df["score_var"] = 0.5 - - dml_data_with_optional = DoubleMLClusterData( - data=df, - y_col="Y", - d_cols="D", - cluster_cols=["cluster_var_i", "cluster_var_j"], - z_cols="Z", - t_col="time_var", - s_col="score_var", - ) - - dml_str_optional = str(dml_data_with_optional) - assert "Time variable: time_var" in dml_str_optional - assert "Score/Selection variable: score_var" in dml_str_optional + assert "Instrument variable(s): ['z']" in dml_str + assert "Is cluster data: True" in dml_str + assert "No. 
Observations:" in dml_str # There's no TimeData or ScoreData here anymore, so the test is complete + # The specialized data classes will be tested in their own test files diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index df2b4cbe..a882c678 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -2,7 +2,7 @@ import pandas as pd from scipy.linalg import toeplitz -from doubleml.data import DoubleMLClusterData +from doubleml.data import DoubleMLData from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias @@ -184,9 +184,7 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return y = d * theta + np.matmul(x, zeta_0) + eps cluster_cols = ["cluster_var_i", "cluster_var_j"] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) - - if return_type in _array_alias: + cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) if return_type in _array_alias: return x, y, d, cluster_vars.values, z elif return_type in _data_frame_alias + _dml_cluster_data_alias: x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] @@ -194,6 +192,6 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return if return_type in _data_frame_alias: return data else: - return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") + return DoubleMLData(data, "Y", "D", x_cols, "Z", cluster_cols, is_cluster_data=True) else: raise ValueError("Invalid return_type.") diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index 10e5d445..c3425239 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -18,9 +18,10 @@ M = 25 # number of observations (second dimension) dim_x = 100 # dimension of x -obj_dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x) +(x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") +obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) -obj_dml_oneway_cluster_data = make_pliv_multiway_cluster_CKMS2021( +(x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( N, M, dim_x, @@ -28,9 +29,11 @@ omega_epsilon=np.array([0.25, 0]), omega_v=np.array([0.25, 0]), omega_V=np.array([0.25, 0]), + return_type="array" ) +obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above -obj_dml_oneway_cluster_data.cluster_cols = "cluster_var_i" +obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" @pytest.fixture( diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index 71998941..9a2c585a 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -7,7 +7,7 @@ from sklearn.linear_model import Lasso, LinearRegression import doubleml as dml -from doubleml import DoubleMLClusterData +from doubleml import DoubleMLData from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from .test_nonlinear_score_mixin import DoubleMLPLRWithNonLinearScoreMixin @@ -20,7 +20,7 @@ # create data without insturment for 
plr x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021( N, @@ -32,7 +32,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" @@ -188,15 +188,14 @@ def dml_plr_cluster_nonlinear_with_index(generate_data1, learner): # Set machine learning methods for m & l ml_l = clone(learner) - ml_m = clone(learner) - + ml_m = clone(learner) obj_dml_data = dml.DoubleMLData(data, "y", ["d"], x_cols) np.random.seed(3141) dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, ml_l, ml_m, n_folds=n_folds) dml_plr_obj.fit() - + df = data.reset_index() - dml_cluster_data = dml.DoubleMLClusterData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") + dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) np.random.seed(3141) dml_plr_cluster_obj = DoubleMLPLRWithNonLinearScoreMixin(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) dml_plr_cluster_obj.fit() diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py index 83f8c270..a4b46e1a 100644 --- a/doubleml/tests/test_sensitivity_cluster.py +++ b/doubleml/tests/test_sensitivity_cluster.py @@ -17,7 +17,7 @@ (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=None, cluster_vars=cluster_vars, is_cluster_data=True) (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( N, @@ -29,7 +29,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=None, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" diff --git a/doubleml/utils/_aliases.py b/doubleml/utils/_aliases.py index e52a5818..679c80d3 100644 --- a/doubleml/utils/_aliases.py +++ b/doubleml/utils/_aliases.py @@ -1,12 +1,13 @@ import numpy as np import pandas as pd -from doubleml.data import DoubleMLClusterData, DoubleMLData +from doubleml.data import DoubleMLData _array_alias = ["array", "np.ndarray", "np.array", np.ndarray] _data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame] _dml_data_alias = ["DoubleMLData", DoubleMLData] -_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData] +# For backwards compatibility, DoubleMLClusterData is now an alias for DoubleMLData with is_cluster_data=True +_dml_cluster_data_alias = ["DoubleMLClusterData", "DoubleMLData"] def _get_array_alias(): From a2566cbb1d8138885091e2f7516919a5aef1d3d5 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 4 Jun 2025 23:01:11 +0200 Subject: [PATCH 21/84] upd --- 
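A minimal usage sketch of the compatibility path added below (the DataFrame construction
mirrors the updated tests, variable names are illustrative): both calls build an equivalent
cluster data-backend, the deprecated wrapper additionally emits a FutureWarning.

    import numpy as np
    import pandas as pd
    from doubleml.data import DoubleMLClusterData, DoubleMLData

    df = pd.DataFrame(
        np.tile(np.arange(6), (6, 1)),
        columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"],
    )

    # deprecated path: warns and forwards to DoubleMLData with is_cluster_data=True
    dml_data_old = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
    # preferred path
    dml_data_new = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True)
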
doubleml/data/__init__.py | 71 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py index dfe673e7..4c235a57 100644 --- a/doubleml/data/__init__.py +++ b/doubleml/data/__init__.py @@ -2,14 +2,85 @@ The :mod:`doubleml.data` module implements data classes for double machine learning. """ +import warnings + from .base_data import DoubleMLData from .did_data import DoubleMLDIDData from .panel_data import DoubleMLPanelData from .rdd_data import DoubleMLRDDData from .ssm_data import DoubleMLSSMData + +class DoubleMLClusterData(DoubleMLData): + """ + Backwards compatibility wrapper for DoubleMLData with is_cluster_data=True. + + This class is deprecated and will be removed in a future version. + Use DoubleMLData with is_cluster_data=True instead. + """ + + def __init__( + self, + data, + y_col, + d_cols, + cluster_cols, + x_cols=None, + z_cols=None, + t_col=None, + s_col=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True, + ): + warnings.warn( + "DoubleMLClusterData is deprecated. " + "Use DoubleMLData with is_cluster_data=True instead.", + FutureWarning, + stacklevel=2, + ) + super().__init__( + data=data, + y_col=y_col, + d_cols=d_cols, + x_cols=x_cols, + z_cols=z_cols, + cluster_cols=cluster_cols, + use_other_treat_as_covariate=use_other_treat_as_covariate, + force_all_x_finite=force_all_x_finite, + force_all_d_finite=True, + is_cluster_data=True, + ) + + @classmethod + def from_arrays( + cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True + ): + """ + Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. + This method is deprecated, use DoubleMLData.from_arrays with is_cluster_data=True instead. + """ + warnings.warn( + "DoubleMLClusterData is deprecated. " + "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", + FutureWarning, + stacklevel=2, + ) + return DoubleMLData.from_arrays( + x=x, + y=y, + d=d, + z=z, + cluster_vars=cluster_vars, + use_other_treat_as_covariate=use_other_treat_as_covariate, + force_all_x_finite=force_all_x_finite, + force_all_d_finite=True, + is_cluster_data=True, + ) + + __all__ = [ "DoubleMLData", + "DoubleMLClusterData", "DoubleMLDIDData", "DoubleMLPanelData", "DoubleMLRDDData", From 9ef4e53f975c5feb577aeab59ce67f63530640cd Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 5 Jun 2025 06:57:05 +0200 Subject: [PATCH 22/84] update lambda and p calculation in did_cs --- doubleml/did/did_cs.py | 8 ++------ doubleml/did/tests/_utils_did_cs_manual.py | 8 ++++---- doubleml/did/tests/_utils_did_manual.py | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index ab2af5b9..5984399c 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -219,14 +219,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold # nuisance estimates of the uncond. treatment prob. - p_hat = np.full_like(d, np.nan, dtype="float64") - for train_index, test_index in smpls: - p_hat[test_index] = np.mean(d[train_index]) + p_hat = np.full_like(d, d.mean(), dtype="float64") # nuisance estimates of the uncond. time prob. 
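+        # as for p_hat above, the per-fold training-sample mean is replaced by the full-sample
+        # mean, since the stratified splitting keeps this probability (nearly) constant across folds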
- lambda_hat = np.full_like(t, np.nan, dtype="float64") - for train_index, test_index in smpls: - lambda_hat[test_index] = np.mean(t[train_index]) + lambda_hat = np.full_like(t, t.mean(), dtype="float64") # nuisance g smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) diff --git a/doubleml/did/tests/_utils_did_cs_manual.py b/doubleml/did/tests/_utils_did_cs_manual.py index f14a52a0..ce6f8870 100644 --- a/doubleml/did/tests/_utils_did_cs_manual.py +++ b/doubleml/did/tests/_utils_did_cs_manual.py @@ -178,12 +178,12 @@ def fit_nuisance_did_cs( m_hat_list.append(np.zeros_like(g_hat_d1_t1_list[idx], dtype="float64")) p_hat_list = [] - for train_index, _ in smpls: - p_hat_list.append(np.mean(d[train_index])) + for _ in smpls: + p_hat_list.append(np.mean(d)) lambda_hat_list = [] - for train_index, _ in smpls: - lambda_hat_list.append(np.mean(t[train_index])) + for _ in smpls: + lambda_hat_list.append(np.mean(t)) return g_hat_d0_t0_list, g_hat_d0_t1_list, g_hat_d1_t0_list, g_hat_d1_t1_list, m_hat_list, p_hat_list, lambda_hat_list diff --git a/doubleml/did/tests/_utils_did_manual.py b/doubleml/did/tests/_utils_did_manual.py index e314c301..b067e44d 100644 --- a/doubleml/did/tests/_utils_did_manual.py +++ b/doubleml/did/tests/_utils_did_manual.py @@ -104,7 +104,7 @@ def fit_nuisance_did( m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls, trimming_threshold=trimming_threshold) p_hat_list = [] - for train_index, _ in smpls: + for _ in smpls: p_hat_list.append(np.mean(d)) return g_hat0_list, g_hat1_list, m_hat_list, p_hat_list From e90441b9366f2c46daecd01809575635a56faeb8 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 5 Jun 2025 11:18:11 +0200 Subject: [PATCH 23/84] add _score_dim property to doubleml class --- doubleml/double_ml.py | 57 +++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 911487a3..c2d3727b 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -103,16 +103,7 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) # initialize arrays according to obj_dml_data and the resampling settings - ( - self._psi, - self._psi_deriv, - self._psi_elements, - self._var_scaling_factors, - self._coef, - self._se, - self._all_coef, - self._all_se, - ) = self._initialize_arrays() + self._initialize_arrays() # initialize instance attributes which are later used for iterating self._i_rep = None @@ -1075,22 +1066,20 @@ def _fit_sensitivity_elements(self, nuisance_predictions): def _initialize_arrays(self): # scores - psi = np.full(self._score_dim, np.nan) - psi_deriv = np.full(self._score_dim, np.nan) - psi_elements = self._initialize_score_elements(self._score_dim) + self._psi = np.full(self._score_dim, np.nan) + self._psi_deriv = np.full(self._score_dim, np.nan) + self._psi_elements = self._initialize_score_elements(self._score_dim) n_rep = self._score_dim[1] n_thetas = self._score_dim[2] - var_scaling_factors = np.full(n_thetas, np.nan) + self._var_scaling_factors = np.full(n_thetas, np.nan) # coefficients and ses - coef = np.full(n_thetas, np.nan) - se = np.full(n_thetas, np.nan) + self._coef = np.full(n_thetas, np.nan) + self._se = np.full(n_thetas, np.nan) - all_coef = np.full((n_thetas, n_rep), np.nan) - all_se = np.full((n_thetas, n_rep), np.nan) - - return psi, psi_deriv, psi_elements, var_scaling_factors, coef, se, all_coef, 
all_se + self._all_coef = np.full((n_thetas, n_rep), np.nan) + self._all_se = np.full((n_thetas, n_rep), np.nan) def _initialize_predictions_and_targets(self): self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} @@ -1211,7 +1200,7 @@ def evaluate_learners(self, learners=None, metric=_rmse): f"The learners have to be a subset of {str(self.params_names)}. Learners {str(learners)} provided." ) - def draw_sample_splitting(self): + def draw_sample_splitting(self, n_obs=None): """ Draw sample splitting for DoubleML models. @@ -1221,26 +1210,27 @@ def draw_sample_splitting(self): Parameters ---------- n_obs : int or None - The number of observations. If ``None``, the number of observations is set to the number of observations in - the data set. + The number of observations to resample. If ``None``, the number of observations is set to the number + of observations in the data set. Returns ------- self : object """ + if n_obs is None: + n_obs = self.n_obs + if self._is_cluster_data: obj_dml_resampling = DoubleMLClusterResampling( n_folds=self._n_folds_per_cluster, n_rep=self.n_rep, - n_obs=self.n_obs, + n_obs=n_obs, n_cluster_vars=self._dml_data.n_cluster_vars, cluster_vars=self._dml_data.cluster_vars, ) self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() else: - obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self.n_obs, stratify=self._strata - ) + obj_dml_resampling = DoubleMLResampling(n_folds=self.n_folds, n_rep=self.n_rep, n_obs=n_obs, stratify=self._strata) self._smpls = obj_dml_resampling.split_samples() return self @@ -1309,16 +1299,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self.n_obs ) - ( - self._psi, - self._psi_deriv, - self._psi_elements, - self._var_scaling_factors, - self._coef, - self._se, - self._all_coef, - self._all_se, - ) = self._initialize_arrays() + # set sample splitting can update the number of repetitions + self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) + self._initialize_arrays() self._initialize_ml_nuisance_params() return self From eb19efef0278c54831bc5b71567b96cf2f90e7da Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 5 Jun 2025 14:04:14 +0200 Subject: [PATCH 24/84] upd 305 --- doubleml/__init__.py | 6 +- doubleml/data/__init__.py | 29 +- doubleml/data/base_data_content.txt | Bin 60862 -> 0 bytes doubleml/data/cluster_data.py | 285 ------------------ doubleml/data/did_data.py | 69 +++-- doubleml/data/panel_data.py | 89 ++++-- doubleml/data/rdd_data.py | 82 ++--- doubleml/data/ssm_data.py | 13 +- doubleml/data/tests/test_cluster_data.py | 127 ++++---- doubleml/data/tests/test_dml_data.py | 14 +- doubleml/did/datasets/dgp_did_SZ2020.py | 25 +- .../dgp_pliv_multiway_cluster_CKMS2021.py | 8 +- doubleml/tests/test_exceptions_fixed.py | 0 doubleml/tests/test_multiway_cluster.py | 9 +- doubleml/tests/test_nonlinear_cluster.py | 13 +- doubleml/tests/test_return_types_fixed.py | 0 doubleml/tests/test_sensitivity_cluster.py | 4 +- doubleml/utils/_aliases.py | 36 ++- doubleml/utils/_check_return_types_fixed.py | 0 19 files changed, 313 insertions(+), 496 deletions(-) delete mode 100644 doubleml/data/base_data_content.txt delete mode 100644 doubleml/data/cluster_data.py create mode 100644 doubleml/tests/test_exceptions_fixed.py create mode 100644 doubleml/tests/test_return_types_fixed.py create mode 100644 
doubleml/utils/_check_return_types_fixed.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 102ea995..6cf7de96 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -1,6 +1,6 @@ import importlib.metadata -from .data import DoubleMLClusterData, DoubleMLData +from .data import DoubleMLClusterData, DoubleMLData, DoubleMLDIDData, DoubleMLPanelData, DoubleMLRDDData, DoubleMLSSMData from .did.did import DoubleMLDID from .did.did_cs import DoubleMLDIDCS from .double_ml_framework import DoubleMLFramework, concat @@ -29,6 +29,10 @@ "DoubleMLIIVM", "DoubleMLData", "DoubleMLClusterData", + "DoubleMLDIDData", + "DoubleMLPanelData", + "DoubleMLRDDData", + "DoubleMLSSMData", "DoubleMLDID", "DoubleMLDIDCS", "DoubleMLPQ", diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py index 4c235a57..7d368b76 100644 --- a/doubleml/data/__init__.py +++ b/doubleml/data/__init__.py @@ -2,6 +2,7 @@ The :mod:`doubleml.data` module implements data classes for double machine learning. """ +from .base_data import DoubleMLData import warnings from .base_data import DoubleMLData @@ -14,11 +15,10 @@ class DoubleMLClusterData(DoubleMLData): """ Backwards compatibility wrapper for DoubleMLData with is_cluster_data=True. - This class is deprecated and will be removed in a future version. Use DoubleMLData with is_cluster_data=True instead. """ - + def __init__( self, data, @@ -33,15 +33,14 @@ def __init__( force_all_x_finite=True, ): warnings.warn( - "DoubleMLClusterData is deprecated. " - "Use DoubleMLData with is_cluster_data=True instead.", + "DoubleMLClusterData is deprecated. " "Use DoubleMLData with is_cluster_data=True instead.", FutureWarning, stacklevel=2, ) super().__init__( data=data, y_col=y_col, - d_cols=d_cols, + d_cols=d_cols, x_cols=x_cols, z_cols=z_cols, cluster_cols=cluster_cols, @@ -50,7 +49,7 @@ def __init__( force_all_d_finite=True, is_cluster_data=True, ) - + @classmethod def from_arrays( cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True @@ -60,15 +59,14 @@ def from_arrays( This method is deprecated, use DoubleMLData.from_arrays with is_cluster_data=True instead. """ warnings.warn( - "DoubleMLClusterData is deprecated. " - "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", + "DoubleMLClusterData is deprecated. 
" "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", FutureWarning, stacklevel=2, ) return DoubleMLData.from_arrays( - x=x, - y=y, - d=d, + x=x, + y=y, + d=d, z=z, cluster_vars=cluster_vars, use_other_treat_as_covariate=use_other_treat_as_covariate, @@ -78,11 +76,4 @@ def from_arrays( ) -__all__ = [ - "DoubleMLData", - "DoubleMLClusterData", - "DoubleMLDIDData", - "DoubleMLPanelData", - "DoubleMLRDDData", - "DoubleMLSSMData", -] +__all__ = ["DoubleMLData", "DoubleMLClusterData", "DoubleMLDIDData", "DoubleMLPanelData", "DoubleMLRDDData", "DoubleMLSSMData"] diff --git a/doubleml/data/base_data_content.txt b/doubleml/data/base_data_content.txt deleted file mode 100644 index 1ccdf7ca9a7ea2f43ee36364c7123001ff431e8c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60862 zcmeI5|8EsXlECNZlkR_bXMVtF7boofvdW1Q93*>4$tJoW6p4!*1BS2xlf#ezyZP7e zu0B&Ps=BAUr{DNJFvPN8?DwXt>w8sK*Yy0~|2`g`4nGgihUdeJVKqFK-~M^{r{SUe z{Y<`D4F|)$;r{qdyzlz(AH$8|k0;+B%h=~~=Y6?89S-IHA7%7|5faeSIgW;xr?`MH{Bv1|p?$Q5$K7s1^9S=VZPh`#m!Qg1v6WA|gz9-|DbY37F%bifV zFZVx@pM9b9v5b2xpB-kD*CFo>zl`G_kJvHaGl2p3B9HD17Y>CdKaI0JA35}TbTmAY-+w$A@k~C0Z%6Vi(%K)c1P(ht9>}*ZMP@*FD*s20?8&IZ zaUL)|5a?yh;SRidDf1l4m_Nx(d}cHtEBkj}=Fq!W!_6@FSHqX0BYblxpEK9Nusf*U zTpB(b$FGEE{OpWBcK1p&s|<5BVh#6@*`bVd$@9r0`2>AouFJv$H1a384=p-AnZ@w7 zxgW^)4`tlG{Fo;2`|gPO6ZsA;H>f%02IcVx<4EXND?*+}0s%~p1dBs~XE1<=9`C*7 z%NL_=U^zATHV;;Yra=4c;kCdy96lfAf==uT7CUd}fm<&}99F^$Mm!s_tj)a=Y!%|3 z-23-&Brpx{OT%w(Vc4DVKKm?=&@sOX*4Qu>tu_etJ3=keFxu>z^3rDC6E3<99Lsq4 zurCy1z5gTE!%V9To?RpF?g%E>Q_8)a;g&#i+pf#$Gr{_X;Dfay5oC>)yc8Or$lv$n zQ>=NFV2!6c5UF5;b^4JmdWW?d^{e6P2w!!(rNiorQw^?Zl|kPZeDE#!gy*6II$H^k z{7(MH_wQC@c~9VCU0{!|*&Dtc?}uZLgxg_Op@jz4z_5MAv3tYcWL{z|GtdA0w{UHziQOLr($2trmpgvIQuS~p)Ter5^sYMKTpnkf$IRrP z>t4xs*!)W=a!-uDsQBd7eD_+GMBVj?-->qO@*&Koq3 z?=|W*D=A6m_L$juRhHgI?NbohSaA1(+^weIeO( zZehsx*{EUUHk$pCMUZt99mu`;pA3|hnL{T#*Kb-Y2%sLy5BWQq1%zGs`*@`AzI>uO zPtF3qa9lMn>e&VPRI}kH^6g_8i&bzZmPjlkCnJ|DYdP5*xL%U)jZ4sqhTDvq(I_8* z%AI5+`f$MRTn5{+J)fZ^<m5H6TIdDz*hDQLma^1wDh{Y{< z#R`bkYLCQhtz{mOYs=DZ3LZ#vS3HAP5K61~Jc6kAj-yVqPFFcZef})q{n_xt1V6+4 z2eBXgtYyd8B9=?8r0ZX76pK@@@pyP6+{7lWdeWLTR-;NQ_aBCfA{XqN|9=(88a4Ir zoCdFf?TwxQOGXo33w^+ua^V~KR;`^lir-1CzLcFtdMskDMz5daGf!QFWx7f1z-t-o zDb}w~w3hsVXrmh6f`#tA7&RSho%-YH9MqjEff@EGz z<|ivAZmDMn_Byygi|1O%+VsKrZr59)xhv6_AI3bK8Tp-AH7ZWI2Zwe~xTV!ZKG7V7 zu_j4=C))iwW(QQa+F*N~es z?xASIOTm!rMl(I;GacpksZB*{)u!zJWjc-L?c;Z}GShRr_EKZ_*Z92⁣89z1ajE zC7p<$sa~)jVAdG(DXrqGmbpf1RT3{r^^6QZLNw0Z8l#feF|jFp0kOyI9+_IUJVkSV$fOVV0F--6h4$s_}y#7o>}7UScw>>FJ#V@e1et!B5?5U8fnSt@$`I${=mNo zk%|>Klx3&27q{ZJ9+l{1M3mt)V^>+Wti0|?qeV}1vyvYr*G)ZFj5X%1K3qRq$|XFf zNwe$+qZO4c@!Yc#0r4c~p4H>|wmd5ylRA+5Oy;zjI==0Uuh-Iu?LKOK z3qnHgPUBYT)f^SLE2g5BzvlkBrtUn)F1$ zo7kxpYnN0!Zwjr{Lf3=3Cp>y0aQ9?B9gnTO7-MESpY4!P!+#`iM&Lob%QV+^)^X;z@x zMRZlNmFb)pCD$v>tCd?;OGwC9O~f_g|I_g^!>uMUdXKbY>!M(Bam*?&%4fA%*WspR z%<$8ywIxb+MQ_Qbwbzf_c60pnnQIjF@X?_tW;Jf^1>XA@b{}#IBOiP>H-3sIM0(Rc zyLgxR3@k@`?@Hey0qpaSi6+@tuP2tjk9EQF9kHT&^=+lBd~d9j&E1=>w7vc}H*RU2 z=jz+FVHx*QCYi6um%zxg(K#6Hx=GsI74;$Ci82ky_1QX0%^R_M+Sl#LY;i`L!Sr)a ztlZ?Nw&Z`e$3-pFk@1uE*r;>Q~S0JGun&7iJ!E`wq>u-+V`~KBONN1 z+K0_b@mO{US#fKpwZ-UKkDp6=sTC>>OGme<>#<;+zvs^N=gKX;K39Hcd#-({0U#lw z4i@FvD_JxBmM)i6!u4_U63%DM_pkcrO-J_3ta`XVQu{_CY?>vR-_hEQ(_XVzGF&*t zP8!uF+r3Hq?hUG49WFhB`amalrg*@v)vOa$hza+6B!Y;K3$jOyYQ@2 zsP$Ogh*}0a+*&EqI##Por>#v!3ts7gVD5BApR4^+N>zjxGZ(F(oOf406dU~TyQ?i9 zIPa?V-v6li<`?Yx`{}ASis44&i2DLtgXCT5o9eD^KTkmK_198KZ*|I00p=+lT}j1E zwbgrVlrsZ$=(+w#-RI8dihi|swi^-wyr!#tFFD+pSN6iaB~eiQw)c9}_UrTcVqV>v 
z!dGLQzXDgZ6TGCM-^d;8-!eu|yDY!UA%9yif0y7*u`oXC``ERvR~l(rb^V01ysM&* zJCXmK`JyLb>*-6X6WqZW-|L^fq!!%XC92bm=Ji!FV<*wlZa(&v>bOjIj?!}%%KhMV zc>U)*wqPtj_puyVR9^KN>;2ArV`fdI<9GS1>rJz@90?_-T%>P30JeEA| z6ZzrnD=6ngOns94seF1X)=bDj#$c(aKaZ#wUaytntpm>%*TQ_d>88f4W&0bF8bT7DyXSsFc@LWg2~`(1D< zmH$!><(MzwklPDq5#*X!%W3OYQGUKW&hlONEIF63g=i0!C9ijuh~ZMplJSFhI$cFS z6knBQ#`tRTRyzyxmE6ranPB4{9S_bq7Mvx>_k5%039||?dn;$Aab;+$QkRUKMXbG< z@m|~Qyjxx0)K(yKv((1T&zP>UEpI+5d%-18{h2je#n?6XsWpkl9{c`YW8tm`rL+J<$ylExe3$>M3G(~5=e`Z~Sr0ekLEHqKrF{+@~jIkc{6xG&Pq z-*aml!_#Lo+10okWoP%Ep1URan{4dL%J3{mV49PZWau)995b1;=q=mV1(9=WmFO$N zG@J2wu5mnSi0<0CE=NpFtF*fHqiCIG7O_qUM!x#=6{o!;RV`SvnD$Du#1AM=v9*(j zQ#{qG()H(AT4APsEaGnVQ?s>5z1{n6Y;C9U-UjzRx*q@Jq(Y$YpS5VLF~q6^eKxZz z9*ce#_uu@@n`TY&D(k5aEbk$h=dkOJWG+CAeAOAKBi#@kr}E4mEcx_G{Is5CLB-`~ z!6B{ou0~i(YeLM6cP?qZ;X93u>*k_*CV*LZO?Rf=|KrOhO1ZxK#x&cB_fA(HC5hqP zv|8<_9mLF<_mZMNkH!pu>cZWo_cShZGxuR|Ky_Da9YWlXD z*j_qsj|IoMb1UwdJ=$eDd&%OJA5a#@wJ9xE4YVqxHN6+|7H1&q8&(g;-&JxhS&NP9 zNWER{F$J3>Nq7jJarT3B=}U zj-=Y6bLv=R7iUpfmh8u{O57?VvrnHuByEX3j+7YtE$uGv;3079vl|Kj1U$u7| z=}w8wPJFyK{7s@L{Y19zq~c~UbZti6!ZN&+5~45B`rxF?$F#(*^**L_w;|e6n`3Yy za^srd+T!qh2Y!3JO9&sPJ;v;b*W{JfDu8WXj}`cR85z$D0_LWkGo!lKre)fNb6YZ{ QZ9l&~hZ8Z|`HF1+ANk8VaR2}S diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py deleted file mode 100644 index 290c61f5..00000000 --- a/doubleml/data/cluster_data.py +++ /dev/null @@ -1,285 +0,0 @@ -import io - -import numpy as np -import pandas as pd -from sklearn.utils import assert_all_finite -from sklearn.utils.validation import check_array - -from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData -from doubleml.utils._estimation import _assure_2d_array - - -class DoubleMLClusterData(DoubleMLData): - """Double machine learning data-backend for data with cluster variables. - - :class:`DoubleMLClusterData` objects can be initialized from - :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. - - Parameters - ---------- - data : :class:`pandas.DataFrame` - The data. - - y_col : str - The outcome variable. - - d_cols : str or list - The treatment variable(s). - - cluster_cols : str or list - The cluster variable(s). - - x_cols : None, str or list - The covariates. - If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. - Default is ``None``. - - z_cols : None, str or list - The instrumental variable(s). - Default is ``None``. - - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD and SSM Estimatiors). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. 
- Default is ``True``. - - Examples - -------- >>> from doubleml import DoubleMLClusterData - >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> # initialization from pandas.DataFrame - >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') - >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') - >>> # initialization from np.ndarray - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - - def __init__( - self, - data, - y_col, - d_cols, - cluster_cols, - x_cols=None, - z_cols=None, - t_col=None, - s_col=None, - use_other_treat_as_covariate=True, - force_all_x_finite=True, - ): - DoubleMLBaseData.__init__(self, data) # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter - self.cluster_cols = cluster_cols - self._set_cluster_vars() - DoubleMLData.__init__( - self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite, is_cluster_data=True - ) - self._check_disjoint_sets_cluster_cols() - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = ( - "================== DoubleMLClusterData Object ==================\n" - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ DataFrame info ------------------\n" - + df_info - ) - return res - - def _data_summary_str(self): - data_summary = ( - f"Outcome variable: {self.y_col}\n" - f"Treatment variable(s): {self.d_cols}\n" - f"Cluster variable(s): {self.cluster_cols}\n" - f"Covariates: {self.x_cols}\n" - f"Instrument variable(s): {self.z_cols}\n" - ) - if self.t_col is not None: - data_summary += f"Time variable: {self.t_col}\n" - if self.s_col is not None: - data_summary += f"Score/Selection variable: {self.s_col}\n" - - data_summary += f"No. Observations: {self.n_obs}\n" - return data_summary - - @classmethod - def from_arrays( - cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True - ): - """ - Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. - - Parameters - ---------- - x : :class:`numpy.ndarray` - Array of covariates. - - y : :class:`numpy.ndarray` - Array of the outcome variable. - - d : :class:`numpy.ndarray` - Array of treatment variables. - - cluster_vars : :class:`numpy.ndarray` - Array of cluster variables. - - z : None or :class:`numpy.ndarray` - Array of instrumental variables. - Default is ``None``. - - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD or SSM models). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). 
- Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- >>> from doubleml import DoubleMLClusterData - >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite, is_cluster_data=True) - cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) - cluster_vars = _assure_2d_array(cluster_vars) - if cluster_vars.shape[1] == 1: - cluster_cols = ["cluster_var"] - else: - cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])] - - data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) - - return cls( - data, - dml_data.y_col, - dml_data.d_cols, - cluster_cols, - dml_data.x_cols, - dml_data.z_cols, - dml_data.t_col, - dml_data.s_col, - dml_data.use_other_treat_as_covariate, - dml_data.force_all_x_finite, - ) - - @property - def cluster_cols(self): - """ - The cluster variable(s). - """ - return self._cluster_cols - - @cluster_cols.setter - def cluster_cols(self, value): - reset_value = hasattr(self, "_cluster_cols") - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError( - "The cluster variable(s) cluster_cols must be of str or list type. " - f"{str(value)} of type {str(type(value))} was passed." - ) - if not len(set(value)) == len(value): - raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.") - if not set(value).issubset(set(self.all_variables)): - raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.") - self._cluster_cols = value - if reset_value: - self._check_disjoint_sets() - self._set_cluster_vars() - - @property - def n_cluster_vars(self): - """ - The number of cluster variables. - """ - return len(self.cluster_cols) - - @property - def cluster_vars(self): - """ - Array of cluster variable(s). 
- """ - return self._cluster_vars.values - - def _get_optional_col_sets(self): - base_optional_col_sets = super()._get_optional_col_sets() - cluster_cols_set = set(self.cluster_cols) - return [cluster_cols_set] + base_optional_col_sets - - def _check_disjoint_sets(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - self._check_disjoint_sets_cluster_cols() - - def _check_disjoint_sets_cluster_cols(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - - # special checks for the additional cluster variables - cluster_cols_set = set(self.cluster_cols) - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - - z_cols_set = set(self.z_cols or []) - t_col_set = {self.t_col} if self.t_col else set() - s_col_set = {self.s_col} if self.s_col else set() - - # TODO: X can not be used as cluster variable - cluster_checks_args = [ - (y_col_set, "outcome variable", "``y_col``"), - (d_cols_set, "treatment variable", "``d_cols``"), - (x_cols_set, "covariate", "``x_cols``"), - (z_cols_set, "instrumental variable", "``z_cols``"), - (t_col_set, "time variable", "``t_col``"), - (s_col_set, "score or selection variable", "``s_col``"), - ] - for set1, name, argument in cluster_checks_args: - self._check_disjoint( - set1=set1, - name1=name, - arg1=argument, - set2=cluster_cols_set, - name2="cluster variable(s)", - arg2="``cluster_cols``", - ) - - def _set_cluster_vars(self): - assert_all_finite(self.data.loc[:, self.cluster_cols]) - self._cluster_vars = self.data.loc[:, self.cluster_cols] diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index 150aeb7d..b528ead8 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -1,5 +1,4 @@ import io -import numpy as np import pandas as pd from sklearn.utils.validation import check_array @@ -30,7 +29,8 @@ class DoubleMLDIDData(DoubleMLData): x_cols : None, str or list The covariates. If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor time variable ``t_col`` are used as covariates. + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor time variable ``t_col`` + are used as covariates. Default is ``None``. z_cols : None, str or list @@ -56,10 +56,9 @@ class DoubleMLDIDData(DoubleMLData): force_all_d_finite : bool Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``. - Default is ``True``. - - Examples - -------- >>> from doubleml import DoubleMLDIDData + Default is ``True``. 
Examples + -------- + >>> from doubleml import DoubleMLDIDData >>> from doubleml.did.datasets import make_did_SZ2020 >>> # initialization from pandas.DataFrame >>> df = make_did_SZ2020(return_type='DataFrame') @@ -74,18 +73,18 @@ def __init__( data, y_col, d_cols, - t_col, x_cols=None, z_cols=None, + t_col=None, cluster_cols=None, use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, ): - # Set time column before calling parent constructor - self.t_col = t_col + # Initialize _t_col to None first to avoid AttributeError during parent init + self._t_col = None - # Call parent constructor + # Call parent constructor first to set _data super().__init__( data=data, y_col=y_col, @@ -97,7 +96,10 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=force_all_d_finite, ) - + + # Set time column after parent constructor (which sets _data) + self.t_col = t_col + # Set time variable array after data is loaded self._set_time_var() @@ -168,15 +170,15 @@ def from_arrays( if t.shape[1] != 1: raise ValueError("t must be a single column.") t_col = "t" - + # Create base data using parent class method base_data = DoubleMLData.from_arrays( x, y, d, z, cluster_vars, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite ) - + # Add time variable to the DataFrame data = pd.concat((base_data.data, pd.DataFrame(t, columns=[t_col])), axis=1) - + return cls( data, base_data.y_col, @@ -201,15 +203,14 @@ def t_col(self): def t_col(self, value): if not isinstance(value, str): raise TypeError( - "The time variable t_col must be of str type. " - f"{str(value)} of type {str(type(value))} was passed." + "The time variable t_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." ) # Check if data exists (during initialization it might not) - if hasattr(self, '_data') and value not in self.all_variables: + if hasattr(self, "_data") and value not in self.all_variables: raise ValueError("Invalid time variable t_col. The time variable is no data column.") self._t_col = value # Update time variable array if data is already loaded - if hasattr(self, '_data'): + if hasattr(self, "_data"): self._set_time_var() @property @@ -217,13 +218,37 @@ def t(self): """ Array of time variable. """ - return self._t.values + if self.t_col is not None: + return self._t.values + else: + return None + + @t_col.setter + def t_col(self, value): + reset_value = hasattr(self, "_t_col") + if value is not None: + if not isinstance(value, str): + raise TypeError( + "The time variable t_col must be of str type (or None). " + f"{str(value)} of type {str(type(value))} was passed." + ) + if value not in self.all_variables: + raise ValueError(f"Invalid time variable t_col. 
{value} is no data column.") + self._t_col = value + else: + self._t_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + def _get_optional_col_sets(self): """Get optional column sets including time column.""" base_optional_col_sets = super()._get_optional_col_sets() - t_col_set = {self.t_col} - return [t_col_set] + base_optional_col_sets + if self.t_col is not None: + t_col_set = {self.t_col} + return [t_col_set] + base_optional_col_sets + return base_optional_col_sets def _check_disjoint_sets(self): """Check that time column doesn't overlap with other variables.""" @@ -259,7 +284,7 @@ def _check_disjoint_sets_t_col(self): def _set_time_var(self): """Set the time variable array.""" - if hasattr(self, '_data') and self.t_col in self.data.columns: + if hasattr(self, "_data") and self.t_col in self.data.columns: self._t = self.data.loc[:, [self.t_col]] def __str__(self): diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index f34b2ee1..c1ec3bb5 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -67,8 +67,7 @@ class DoubleMLPanelData(DoubleMLData): ... y_col="y", ... d_cols="d", ... id_col="id", - ... t_col="t", - ... x_cols=["Z1", "Z2", "Z3", "Z4"], + ... t_col="t", ... x_cols=["Z1", "Z2", "Z3", "Z4"], ... datetime_unit="M" ... ) """ @@ -83,16 +82,20 @@ def __init__( x_cols=None, z_cols=None, use_other_treat_as_covariate=True, - force_all_x_finite=True, datetime_unit="M", + force_all_x_finite=True, + datetime_unit="M", ): DoubleMLBaseData.__init__(self, data) # we need to set id_col (needs _data) before call to the super __init__ because of the x_cols setter self.id_col = id_col self._datetime_unit = _is_valid_datetime_unit(datetime_unit) - self._set_id_var() # Set t_col first before calling parent constructor + self._set_id_var() + + # Set time column before calling parent constructor self.t_col = t_col + # Call parent constructor DoubleMLData.__init__( self, data=data, @@ -104,11 +107,14 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=False, ) + + # Set time variable array after data is loaded + self._set_time_var() + if self.n_treat != 1: raise ValueError("Only one treatment column is allowed for panel data.") self._check_disjoint_sets_id_col() - self._set_t() # intialize the unique values of g and t self._g_values = np.sort(np.unique(self.d)) # unique values of g @@ -151,9 +157,8 @@ def datetime_unit(self): """ The unit of the time variable. """ - return self._datetime_unit + return self._datetime_unit @ property - @property def d(self): """ Array of treatment variable; @@ -171,7 +176,7 @@ def t(self): """ Array of time variable. """ - if pd.api.types.is_datetime64_any_dtype(self._d): + if pd.api.types.is_datetime64_any_dtype(self._t): return self._t.values.astype(f"datetime64[{self.datetime_unit}]") else: return self._t.values @@ -216,14 +221,15 @@ def n_obs(self): """ The number of observations. For panel data, the number of unique values for id_col. """ - return len(self._id_var_unique) @property + return len(self._id_var_unique) + + @property def g_col(self): """ The treatment variable indicating the time of treatment exposure. 
""" - return self._d_cols[0] + return self._d_cols[0] @ DoubleMLData.d_cols.setter - @DoubleMLData.d_cols.setter def d_cols(self, value): super(self.__class__, self.__class__).d_cols.__set__(self, value) if hasattr(self, "_g_values"): @@ -232,7 +238,8 @@ def d_cols(self, value): @property def g_values(self): """ - The unique values of the treatment variable (groups) ``d``. """ + The unique values of the treatment variable (groups) ``d``. + """ return self._g_values @property @@ -253,25 +260,19 @@ def t_col(self): def t_col(self, value): if value is None: raise TypeError("Invalid time variable t_col. Time variable required for panel data.") - reset_value = hasattr(self, "_t_col") if not isinstance(value, str): raise TypeError( - f"The time variable t_col must be of str type. {str(value)} of type {str(type(value))} was passed." + "The time variable t_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." ) - if value not in self.all_variables: - raise ValueError(f"Invalid time variable t_col. {value} is no data column.") + # Check if data exists (during initialization it might not) + if hasattr(self, "_data") and value not in self.all_variables: + raise ValueError("Invalid time variable t_col. The time variable is no data column.") self._t_col = value - if reset_value: - self._check_disjoint_sets() - self._set_t() - if hasattr(self, "_t_values"): - self._t_values = np.sort(np.unique(self.t)) # update unique values of t - - def _set_t(self): - """Set time variable.""" - if self.t_col is not None: - assert_all_finite(self.data.loc[:, self.t_col]) - self._t = self.data.loc[:, self.t_col] + # Update time variable array if data is already loaded + if hasattr(self, "_data"): + self._set_time_var() + if hasattr(self, "_t_values"): + self._t_values = np.sort(np.unique(self.t)) # update unique values of t @property def t_values(self): @@ -290,13 +291,14 @@ def n_t_periods(self): def _get_optional_col_sets(self): base_optional_col_sets = super()._get_optional_col_sets() id_col_set = {self.id_col} - t_col_set = {self.t_col} # t_col is not None for panel data + t_col_set = {self.t_col} return [id_col_set, t_col_set] + base_optional_col_sets def _check_disjoint_sets(self): # apply the standard checks from the DoubleMLData class super(DoubleMLPanelData, self)._check_disjoint_sets() self._check_disjoint_sets_id_col() + self._check_disjoint_sets_t_col() def _check_disjoint_sets_id_col(self): # apply the standard checks from the DoubleMLData class @@ -329,7 +331,38 @@ def _check_disjoint_sets_id_col(self): arg2="``id_col``", ) + def _check_disjoint_sets_t_col(self): + """Check that time column is disjoint from other variable sets.""" + t_col_set = {self.t_col} + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + z_cols_set = set(self.z_cols or []) + id_col_set = {self.id_col} + + t_checks_args = [ + (y_col_set, "outcome variable", "``y_col``"), + (d_cols_set, "treatment variable", "``d_cols``"), + (x_cols_set, "covariate", "``x_cols``"), + (z_cols_set, "instrumental variable", "``z_cols``"), + (id_col_set, "identifier variable", "``id_col``"), + ] + for set1, name, argument in t_checks_args: + self._check_disjoint( + set1=set1, + name1=name, + arg1=argument, + set2=t_col_set, + name2="time variable", + arg2="``t_col``", + ) + def _set_id_var(self): assert_all_finite(self.data.loc[:, self.id_col]) self._id_var = self.data.loc[:, self.id_col] self._id_var_unique = np.unique(self._id_var.values) + + def _set_time_var(self): + """Set the time 
variable array.""" + if hasattr(self, "_data") and self.t_col in self.data.columns: + self._t = self.data.loc[:, self.t_col] diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py index 3798dd7e..ac0fff67 100644 --- a/doubleml/data/rdd_data.py +++ b/doubleml/data/rdd_data.py @@ -24,13 +24,13 @@ class DoubleMLRDDData(DoubleMLData): d_cols : str or list The treatment variable(s). - s_col : str + score_col : str The score/running variable for RDD models. x_cols : None, str or list The covariates. If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor score variable ``s_col`` are used as covariates. + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor score variable ``score_col`` are used as covariates. Default is ``None``. z_cols : None, str or list @@ -74,7 +74,7 @@ def __init__( data, y_col, d_cols, - s_col, + score_col, x_cols=None, z_cols=None, cluster_cols=None, @@ -83,8 +83,8 @@ def __init__( force_all_d_finite=True, ): # Set score column before calling parent constructor - self.s_col = s_col - + self.score_col = score_col + # Call parent constructor super().__init__( data=data, @@ -97,7 +97,7 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=force_all_d_finite, ) - + # Set score variable array after data is loaded self._set_score_var() @@ -107,7 +107,7 @@ def from_arrays( x, y, d, - s, + score, z=None, cluster_vars=None, use_other_treat_as_covariate=True, @@ -128,7 +128,7 @@ def from_arrays( d : :class:`numpy.ndarray` Array of treatment variables. - s : :class:`numpy.ndarray` + score : :class:`numpy.ndarray` Array of the score/running variable for RDD models. z : None or :class:`numpy.ndarray` @@ -157,31 +157,32 @@ def from_arrays( Default is ``True``. Examples - -------- >>> from doubleml import DoubleMLRDDData + -------- + >>> from doubleml import DoubleMLRDDData >>> from doubleml.rdd.datasets import make_rdd_data >>> (x, y, d, s) = make_rdd_data(return_type='array') >>> obj_dml_data_from_array = DoubleMLRDDData.from_arrays(x, y, d, s=s) """ # Prepare score variable - s = check_array(s, ensure_2d=False, allow_nd=False) - s = _assure_2d_array(s) - if s.shape[1] != 1: - raise ValueError("s must be a single column.") - s_col = "s" - + score = check_array(score, ensure_2d=False, allow_nd=False) + score = _assure_2d_array(score) + if score.shape[1] != 1: + raise ValueError("score must be a single column.") + score_col = "score" + # Create base data using parent class method base_data = DoubleMLData.from_arrays( x, y, d, z, cluster_vars, use_other_treat_as_covariate, force_all_x_finite, force_all_d_finite ) - + # Add score variable to the DataFrame - data = pd.concat((base_data.data, pd.DataFrame(s, columns=[s_col])), axis=1) - + data = pd.concat((base_data.data, pd.DataFrame(score, columns=[score_col])), axis=1) + return cls( data, base_data.y_col, base_data.d_cols, - s_col, + score_col, base_data.x_cols, base_data.z_cols, base_data.cluster_cols, @@ -191,49 +192,48 @@ def from_arrays( ) @property - def s_col(self): + def score_col(self): """ The score/running variable. """ - return self._s_col + return self._score_col - @s_col.setter - def s_col(self, value): + @score_col.setter + def score_col(self, value): if not isinstance(value, str): raise TypeError( - "The score variable s_col must be of str type. " - f"{str(value)} of type {str(type(value))} was passed." 
+ "The score variable score_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." ) # Check if data exists (during initialization it might not) - if hasattr(self, '_data') and value not in self.all_variables: - raise ValueError("Invalid score variable s_col. The score variable is no data column.") - self._s_col = value + if hasattr(self, "_data") and value not in self.all_variables: + raise ValueError("Invalid score variable score_col. The score variable is no data column.") + self._score_col = value # Update score variable array if data is already loaded - if hasattr(self, '_data'): + if hasattr(self, "_data"): self._set_score_var() @property - def s(self): + def score(self): """ Array of score/running variable. """ - return self._s.values + return self._score.values def _get_optional_col_sets(self): """Get optional column sets including score column.""" base_optional_col_sets = super()._get_optional_col_sets() - s_col_set = {self.s_col} - return [s_col_set] + base_optional_col_sets + score_col_set = {self.score_col} + return [score_col_set] + base_optional_col_sets def _check_disjoint_sets(self): """Check that score column doesn't overlap with other variables.""" # Apply standard checks from parent class super()._check_disjoint_sets() - self._check_disjoint_sets_s_col() + self._check_disjoint_sets_score_col() - def _check_disjoint_sets_s_col(self): + def _check_disjoint_sets_score_col(self): """Check that score column is disjoint from other variable sets.""" - s_col_set = {self.s_col} + score_col_set = {self.score_col} y_col_set = {self.y_col} x_cols_set = set(self.x_cols) d_cols_set = set(self.d_cols) @@ -252,21 +252,21 @@ def _check_disjoint_sets_s_col(self): set1=set1, name1=name, arg1=argument, - set2=s_col_set, + set2=score_col_set, name2="score variable", - arg2="``s_col``", + arg2="``score_col``", ) def _set_score_var(self): """Set the score variable array.""" - if hasattr(self, '_data') and self.s_col in self.data.columns: - self._s = self.data.loc[:, [self.s_col]] + if hasattr(self, "_data") and self.score_col in self.data.columns: + self._score = self.data.loc[:, [self.score_col]] def __str__(self): """String representation.""" data_summary = self._data_summary_str() buf = io.StringIO() print("================== DoubleMLRDDData Object ==================", file=buf) - print(f"Score variable: {self.s_col}", file=buf) + print(f"Score variable: {self.score_col}", file=buf) print(data_summary, file=buf) return buf.getvalue() diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py index d8f3988e..301a4234 100644 --- a/doubleml/data/ssm_data.py +++ b/doubleml/data/ssm_data.py @@ -1,5 +1,4 @@ import io -import numpy as np import pandas as pd from sklearn.utils.validation import check_array @@ -30,7 +29,8 @@ class DoubleMLSSMData(DoubleMLData): x_cols : None, str or list The covariates. If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor selection variable ``s_col`` are used as covariates. + treatment variables ``d_cols``, nor instrumental variables ``z_cols``, nor selection variable ``s_col`` + are used as covariates. Default is ``None``. z_cols : None, str or list @@ -203,15 +203,14 @@ def s_col(self): def s_col(self, value): if not isinstance(value, str): raise TypeError( - "The selection variable s_col must be of str type. " - f"{str(value)} of type {str(type(value))} was passed." 
+ "The selection variable s_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." ) # Check if data exists (during initialization it might not) - if hasattr(self, '_data') and value not in self.all_variables: + if hasattr(self, "_data") and value not in self.all_variables: raise ValueError("Invalid selection variable s_col. The selection variable is no data column.") self._s_col = value # Update selection variable array if data is already loaded - if hasattr(self, '_data'): + if hasattr(self, "_data"): self._set_selection_var() @property @@ -261,7 +260,7 @@ def _check_disjoint_sets_s_col(self): def _set_selection_var(self): """Set the selection variable array.""" - if hasattr(self, '_data') and self.s_col in self.data.columns: + if hasattr(self, "_data") and self.s_col in self.data.columns: self._s = self.data.loc[:, [self.s_col]] def __str__(self): diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index 09a45ccd..4489e528 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -9,15 +9,13 @@ @pytest.mark.ci def test_obj_vs_from_arrays(): np.random.seed(3141) - (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") - dml_data = DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) + dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) dml_data_from_array = DoubleMLData.from_arrays( dml_data.data[dml_data.x_cols], dml_data.data[dml_data.y_col], dml_data.data[dml_data.d_cols], - z=dml_data.data[dml_data.z_cols], - cluster_vars=dml_data.data[dml_data.cluster_cols], - is_cluster_data=True + dml_data.data[dml_data.cluster_cols], + dml_data.data[dml_data.z_cols], ) df = dml_data.data.copy() df.rename( @@ -30,9 +28,8 @@ def test_obj_vs_from_arrays(): dml_data.data[dml_data.x_cols], dml_data.data[dml_data.y_col], dml_data.data[dml_data.d_cols], - z=dml_data.data[dml_data.z_cols], - cluster_vars=dml_data.data[dml_data.cluster_cols[1]], - is_cluster_data=True + dml_data.data[dml_data.cluster_cols[1]], + dml_data.data[dml_data.z_cols], ) df = dml_data.data.copy().drop(columns="cluster_var_i") df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True) @@ -42,7 +39,7 @@ def test_obj_vs_from_arrays(): @pytest.mark.ci def test_x_cols_setter_defaults_w_cluster(): df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1") assert dml_data.x_cols == ["xx1", "xx2", "xx3"] dml_data.x_cols = ["xx1", "xx3"] assert dml_data.x_cols == ["xx1", "xx3"] @@ -51,53 +48,48 @@ def test_x_cols_setter_defaults_w_cluster(): # with instrument df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z", is_cluster_data=True) + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z") assert dml_data.x_cols == ["xx1", "xx2"] # without instrument and with time df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", 
"tt"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt") + assert dml_data.x_cols == ["xx1", "xx2"] # with instrument and with time df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", "tt"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt") + assert dml_data.x_cols == ["xx1", "xx2"] # without instrument and with selection df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", "ss"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss") + assert dml_data.x_cols == ["xx1", "xx2"] # with instrument and with selection df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", "ss"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss") + assert dml_data.x_cols == ["xx1", "xx2"] # without instrument with time with selection df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", "tt", "ss"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss") + assert dml_data.x_cols == ["xx1", "xx2"] # with instrument with time with selection df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", is_cluster_data=True) - assert dml_data.x_cols == ["xx1", "xx2", "tt", "ss"] + dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss") + assert dml_data.x_cols == ["xx1", "xx2"] @pytest.mark.ci def test_cluster_cols_setter(): np.random.seed(3141) - (x, y, d) = make_plr_CCDDHNR2018(n_obs=100, return_type="array") - # Create a pandas DataFrame with X, y, and d columns - df = pd.DataFrame(np.column_stack((x[:, :7], y, d)), - columns=[f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"]) - + dml_data = make_plr_CCDDHNR2018(n_obs=100) + df = dml_data.data.copy().iloc[:, :10] + df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"] dml_data = DoubleMLData( - df, "y", ["d1", "d2"], - x_cols=[f"X{i + 1}" for i in np.arange(5)], - cluster_cols=[f"X{i + 1}" for i in [5, 6]], - is_cluster_data=True + df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)] ) cluster_vars = df[["X6", "X7"]].values @@ -137,49 +129,56 @@ def test_disjoint_sets(): r"and cluster variable\(s\) \(``cluster_cols``\)." 
) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy", is_cluster_data=True) + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy") msg = ( r"At least one variable/column is set as treatment variable \(``d_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1", is_cluster_data=True) + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1") msg = ( r"At least one variable/column is set as covariate \(``x_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2", is_cluster_data=True) + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2") msg = ( r"At least one variable/column is set as instrumental variable \(``z_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2", is_cluster_data=True) + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2") + + msg = ( + r"At least one variable/column is set as time variable \(``t_col``\) " + r"and cluster variable\(s\) \(``cluster_cols``\)." + ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") + + msg = ( + r"At least one variable/column is set as score or selection variable \(``s_col``\) " + r"and cluster variable\(s\) \(``cluster_cols``\)." + ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") @pytest.mark.ci def test_duplicates(): np.random.seed(3141) - (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") - df = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=[f"X{i+1}" for i in range(x.shape[1])] + ["Y", "D", "Z"]) - cluster_df = pd.DataFrame(cluster_vars, columns=["cluster_var_i", "cluster_var_j"]) - data = pd.concat([df, cluster_df], axis=1) + dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2", "X3"], is_cluster_data=True) - - dml_data = DoubleMLData(data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2"], is_cluster_data=True) + _ = DoubleMLData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"]) with pytest.raises(ValueError, match=msg): - dml_data.cluster_cols = ["X3", "X2", "X3"] + dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] msg = "Invalid pd.DataFrame: Contains duplicate column names." 
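For orientation, a minimal illustrative sketch of the consolidated interface that these tests exercise (not part of the patch; it assumes ``DoubleMLData`` accepts ``cluster_cols`` directly, as in the fixtures above):

import numpy as np
import pandas as pd
from doubleml import DoubleMLData

# toy frame with a single cluster column, mirroring the test fixtures above
df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"])
dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
print(dml_data.cluster_cols)  # expected: ['cluster1']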
with pytest.raises(ValueError, match=msg): _ = DoubleMLData( - pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), - y_col="y", d_cols=["d"], cluster_cols=["X2"], is_cluster_data=True + pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"] ) @@ -187,14 +186,13 @@ def test_duplicates(): def test_dml_datatype(): data_array = np.zeros((100, 10)) with pytest.raises(TypeError): - _ = DoubleMLData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"], is_cluster_data=True) + _ = DoubleMLData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"]) @pytest.mark.ci def test_cluster_data_str(): np.random.seed(3141) - (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") - dml_data = DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) + dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) # Convert the object to string dml_str = str(dml_data) @@ -205,11 +203,28 @@ def test_cluster_data_str(): assert "------------------ DataFrame info ------------------" in dml_str # Check that specific data attributes are correctly included - assert "Outcome variable: y" in dml_str - assert "Treatment variable(s): ['d']" in dml_str - assert "Cluster variable(s): ['cluster_var1', 'cluster_var2']" in dml_str + assert "Outcome variable: Y" in dml_str + assert "Treatment variable(s): ['D']" in dml_str + assert "Cluster variable(s): ['cluster_var_i', 'cluster_var_j']" in dml_str assert "Covariates: " in dml_str - assert "Instrument variable(s): ['z']" in dml_str - assert "Is cluster data: True" in dml_str - assert "No. Observations:" in dml_str # There's no TimeData or ScoreData here anymore, so the test is complete - # The specialized data classes will be tested in their own test files + assert "Instrument variable(s): ['Z']" in dml_str + assert "No. 
Observations:" in dml_str + + # Test with additional optional attributes + df = dml_data.data.copy() + df["time_var"] = 1 + df["score_var"] = 0.5 + + dml_data_with_optional = DoubleMLData( + data=df, + y_col="Y", + d_cols="D", + cluster_cols=["cluster_var_i", "cluster_var_j"], + z_cols="Z", + t_col="time_var", + s_col="score_var", + ) + + dml_str_optional = str(dml_data_with_optional) + assert "Time variable: time_var" in dml_str_optional + assert "Score/Selection variable: score_var" in dml_str_optional diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py index a2ada74b..7dc94fd4 100644 --- a/doubleml/data/tests/test_dml_data.py +++ b/doubleml/data/tests/test_dml_data.py @@ -3,7 +3,8 @@ import pytest from sklearn.linear_model import Lasso, LogisticRegression -from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM +from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM, DoubleMLDIDData, DoubleMLSSMData + from doubleml.data.base_data import DoubleMLBaseData from doubleml.plm.datasets import ( _make_pliv_data, @@ -102,7 +103,7 @@ def test_obj_vs_from_arrays(): assert np.array_equal(dml_data_from_array.data, dml_data.data) dml_data = make_did_SZ2020(n_obs=100, cross_sectional_data=True) - dml_data_from_array = DoubleMLData.from_arrays( + dml_data_from_array = DoubleMLDIDData.from_arrays( x=dml_data.data[dml_data.x_cols], y=dml_data.data[dml_data.y_col], d=dml_data.data[dml_data.d_cols], @@ -113,7 +114,7 @@ def test_obj_vs_from_arrays(): # check with instrument and time variable dml_data = make_did_SZ2020(n_obs=100, cross_sectional_data=True) dml_data.data["z"] = dml_data.data["t"] - dml_data_from_array = DoubleMLData.from_arrays( + dml_data_from_array = DoubleMLDIDData.from_arrays( x=dml_data.data[dml_data.x_cols], y=dml_data.data[dml_data.y_col], d=dml_data.data[dml_data.d_cols], @@ -146,14 +147,11 @@ def test_dml_data_no_instr_no_time_no_selection(): dml_data = make_plr_CCDDHNR2018(n_obs=100) assert dml_data.z is None assert dml_data.n_instr == 0 - assert dml_data.t is None x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type="array") dml_data = DoubleMLData.from_arrays(x, y, d) assert dml_data.z is None assert dml_data.n_instr == 0 - assert dml_data.t is None - assert dml_data.s is None @pytest.mark.ci @@ -324,7 +322,7 @@ def test_t_col_setter(): np.random.seed(3141) df = make_did_SZ2020(n_obs=100, cross_sectional_data=True, return_type=pd.DataFrame) df["t_new"] = np.ones(shape=(100,)) - dml_data = DoubleMLData(df, "y", "d", [f"Z{i + 1}" for i in np.arange(4)], t_col="t") + dml_data = DoubleMLDIDData(df, "y", "d", z_cols=[f"Z{i + 1}" for i in np.arange(4)], t_col="t") # check that after changing t_col, the t array gets updated t_comp = dml_data.data["t_new"].values @@ -349,7 +347,7 @@ def test_s_col_setter(): np.random.seed(3141) df = make_ssm_data(n_obs=100, return_type=pd.DataFrame) df["s_new"] = np.ones(shape=(100,)) - dml_data = DoubleMLData(df, "y", "d", [f"X{i + 1}" for i in np.arange(4)], s_col="s") + dml_data = DoubleMLSSMData(df, "y", "d", x_cols=[f"X{i + 1}" for i in np.arange(4)], s_col="s") # check that after changing s_col, the s array gets updated s_comp = dml_data.data["s_new"].values diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index ccfd4a80..0c0f31ab 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -2,13 +2,14 @@ import pandas as pd from scipy.linalg import toeplitz -from ...data.base_data 
import DoubleMLData +from ...data.did_data import DoubleMLDIDData from ...data.panel_data import DoubleMLPanelData -from ...utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias +from ...utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_did_data_alias, _get_dml_panel_data_alias _array_alias = _get_array_alias() _data_frame_alias = _get_data_frame_alias() -_dml_data_alias = _get_dml_data_alias() +_dml_did_data_alias = _get_dml_did_data_alias() +_dml_panel_data_alias = _get_dml_panel_data_alias() def _generate_features(n_obs, c, dim_x=4): @@ -60,7 +61,7 @@ def _f_ps(w, xi): return res -def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLData", **kwargs): +def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLDIDData", **kwargs): """ Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let @@ -130,7 +131,7 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty cross_sectional_data : Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + If ``'DoubleMLDIDData'`` or ``DoubleMLDIDData``, returns a ``DoubleMLDIDData`` object. If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. @@ -181,13 +182,16 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _array_alias: return z, y, d, None - elif return_type in _data_frame_alias + _dml_data_alias: + elif return_type in _data_frame_alias + _dml_did_data_alias + _dml_panel_data_alias: z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"]) if return_type in _data_frame_alias: return data + elif return_type in _dml_did_data_alias: + return DoubleMLDIDData(data, "y", "d", z_cols=z_cols, t_col="t") else: - return DoubleMLData(data, "y", "d", z_cols) + assert return_type in _dml_panel_data_alias + return DoubleMLPanelData(data, "y", "d", t_col="t", id_col="id", x_cols=z_cols) elif return_type == "DoubleMLPanelData": z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] df0 = ( @@ -227,12 +231,15 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _array_alias: return z, y, d, t - elif return_type in _data_frame_alias + _dml_data_alias: + elif return_type in _data_frame_alias + _dml_did_data_alias + _dml_panel_data_alias: z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"]) if return_type in _data_frame_alias: return data + elif return_type in _dml_did_data_alias: + return DoubleMLDIDData(data, "y", "d", z_cols=z_cols, t_col="t") else: - return DoubleMLData(data, "y", "d", z_cols, t_col="t") + assert return_type in _dml_panel_data_alias + return DoubleMLPanelData(data, "y", "d", t_col="t", id_col="id", x_cols=z_cols) else: raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index a882c678..df2b4cbe 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ 
-2,7 +2,7 @@ import pandas as pd from scipy.linalg import toeplitz -from doubleml.data import DoubleMLData +from doubleml.data import DoubleMLClusterData from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias @@ -184,7 +184,9 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return y = d * theta + np.matmul(x, zeta_0) + eps cluster_cols = ["cluster_var_i", "cluster_var_j"] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) if return_type in _array_alias: + cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) + + if return_type in _array_alias: return x, y, d, cluster_vars.values, z elif return_type in _data_frame_alias + _dml_cluster_data_alias: x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] @@ -192,6 +194,6 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return if return_type in _data_frame_alias: return data else: - return DoubleMLData(data, "Y", "D", x_cols, "Z", cluster_cols, is_cluster_data=True) + return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") else: raise ValueError("Invalid return_type.") diff --git a/doubleml/tests/test_exceptions_fixed.py b/doubleml/tests/test_exceptions_fixed.py new file mode 100644 index 00000000..e69de29b diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index c3425239..10e5d445 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -18,10 +18,9 @@ M = 25 # number of observations (second dimension) dim_x = 100 # dimension of x -(x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x) -(x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( +obj_dml_oneway_cluster_data = make_pliv_multiway_cluster_CKMS2021( N, M, dim_x, @@ -29,11 +28,9 @@ omega_epsilon=np.array([0.25, 0]), omega_v=np.array([0.25, 0]), omega_V=np.array([0.25, 0]), - return_type="array" ) -obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, cluster_vars=cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above -obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" +obj_dml_oneway_cluster_data.cluster_cols = "cluster_var_i" @pytest.fixture( diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index 9a2c585a..71998941 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -7,7 +7,7 @@ from sklearn.linear_model import Lasso, LinearRegression import doubleml as dml -from doubleml import DoubleMLData +from doubleml import DoubleMLClusterData from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from .test_nonlinear_score_mixin import DoubleMLPLRWithNonLinearScoreMixin @@ -20,7 +20,7 @@ # create data without insturment for plr x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) x, y, d, cluster_vars, z = 
make_pliv_multiway_cluster_CKMS2021( N, @@ -32,7 +32,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" @@ -188,14 +188,15 @@ def dml_plr_cluster_nonlinear_with_index(generate_data1, learner): # Set machine learning methods for m & l ml_l = clone(learner) - ml_m = clone(learner) + ml_m = clone(learner) + obj_dml_data = dml.DoubleMLData(data, "y", ["d"], x_cols) np.random.seed(3141) dml_plr_obj = DoubleMLPLRWithNonLinearScoreMixin(obj_dml_data, ml_l, ml_m, n_folds=n_folds) dml_plr_obj.fit() - + df = data.reset_index() - dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) + dml_cluster_data = dml.DoubleMLClusterData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") np.random.seed(3141) dml_plr_cluster_obj = DoubleMLPLRWithNonLinearScoreMixin(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) dml_plr_cluster_obj.fit() diff --git a/doubleml/tests/test_return_types_fixed.py b/doubleml/tests/test_return_types_fixed.py new file mode 100644 index 00000000..e69de29b diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py index a4b46e1a..83f8c270 100644 --- a/doubleml/tests/test_sensitivity_cluster.py +++ b/doubleml/tests/test_sensitivity_cluster.py @@ -17,7 +17,7 @@ (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=None, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021( N, @@ -29,7 +29,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, z=None, cluster_vars=cluster_vars, is_cluster_data=True) +obj_dml_oneway_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" diff --git a/doubleml/utils/_aliases.py b/doubleml/utils/_aliases.py index 679c80d3..b1dcaa21 100644 --- a/doubleml/utils/_aliases.py +++ b/doubleml/utils/_aliases.py @@ -1,13 +1,23 @@ import numpy as np import pandas as pd -from doubleml.data import DoubleMLData +from doubleml.data import ( + DoubleMLClusterData, + DoubleMLData, + DoubleMLDIDData, + DoubleMLPanelData, + DoubleMLRDDData, + DoubleMLSSMData, +) _array_alias = ["array", "np.ndarray", "np.array", np.ndarray] _data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame] _dml_data_alias = ["DoubleMLData", DoubleMLData] -# For backwards compatibility, DoubleMLClusterData is now an alias for DoubleMLData with is_cluster_data=True -_dml_cluster_data_alias = ["DoubleMLClusterData", "DoubleMLData"] +_dml_did_data_alias = ["DoubleMLDIDData", DoubleMLDIDData] +_dml_panel_data_alias = ["DoubleMLPanelData", DoubleMLPanelData] +_dml_rdd_data_alias = ["DoubleMLRDDData", DoubleMLRDDData] +_dml_ssm_data_alias = ["DoubleMLSSMData", DoubleMLSSMData] +_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData] def _get_array_alias(): @@ -28,3 
+38,23 @@ def _get_dml_data_alias(): def _get_dml_cluster_data_alias(): """Returns the list of DoubleMLClusterData aliases.""" return _dml_cluster_data_alias + + +def _get_dml_did_data_alias(): + """Returns the list of DoubleMLDIDData aliases.""" + return _dml_did_data_alias + + +def _get_dml_panel_data_alias(): + """Returns the list of DoubleMLPanelData aliases.""" + return _dml_panel_data_alias + + +def _get_dml_rdd_data_alias(): + """Returns the list of DoubleMLRDDData aliases.""" + return _dml_rdd_data_alias + + +def _get_dml_ssm_data_alias(): + """Returns the list of DoubleMLSSMData aliases.""" + return _dml_ssm_data_alias diff --git a/doubleml/utils/_check_return_types_fixed.py b/doubleml/utils/_check_return_types_fixed.py new file mode 100644 index 00000000..e69de29b From 97abdd87dfba71d8c7c7f5fb9cd3b2bde66243e9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 5 Jun 2025 15:59:55 +0200 Subject: [PATCH 25/84] update data backends --- doubleml/data/base_data.py | 41 ++++++++-- doubleml/data/did_data.py | 101 +++++++++++++++--------- doubleml/data/ssm_data.py | 89 +++++++++++++++------ doubleml/data/tests/test_dml_data.py | 52 +++++------- doubleml/did/datasets/dgp_did_SZ2020.py | 17 ++-- 5 files changed, 187 insertions(+), 113 deletions(-) diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py index 8d585633..9ba8bc00 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from sklearn.utils import assert_all_finite +from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d from doubleml.utils._estimation import _assure_2d_array @@ -11,8 +12,9 @@ class DoubleMLBaseData(ABC): """Bas x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] - # baseline version with features, outcome and treatments - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)Class Double machine learning data-backends""" + # baseline version with features, outcome and treatments + data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)Class Double machine learning data-backends + """ def __init__(self, data): if not isinstance(data, pd.DataFrame): @@ -108,11 +110,11 @@ class DoubleMLData(DoubleMLBaseData): Default is ``None``. cluster_cols : None, str or list The cluster variable(s). Default is ``None``. - + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. - + is_cluster_data : bool Flag indicating whether this data object is being used for cluster data. Default is ``False``. @@ -191,7 +193,7 @@ def __str__(self): + "\n------------------ DataFrame info ------------------\n" + df_info ) - return res + return res def _data_summary_str(self): data_summary = ( @@ -204,7 +206,7 @@ def _data_summary_str(self): if self.cluster_cols is not None: data_summary += f"Cluster variable(s): {self.cluster_cols}\n" - if hasattr(self, 'is_cluster_data') and self.is_cluster_data: + if hasattr(self, "is_cluster_data") and self.is_cluster_data: data_summary += f"Is cluster data: {self.is_cluster_data}\n" data_summary += f"No. 
Observations: {self.n_obs}\n" return data_summary @@ -328,7 +330,7 @@ def from_arrays( else: d_cols = [f"d{i + 1}" for i in np.arange(d.shape[1])] - x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] # baseline version with features, outcome and treatments + x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] # baseline version with features, outcome and treatments data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols) if z is not None: df_z = pd.DataFrame(z, columns=z_cols) @@ -708,6 +710,31 @@ def _get_optional_col_sets(self): cluster_cols_set = set(self.cluster_cols or []) return [cluster_cols_set, z_cols_set] + def _check_binary_treats(self): + is_binary = pd.Series(dtype=bool, index=self.d_cols) + if not self.force_all_d_finite: + is_binary[:] = False # if we allow infinite values, we cannot check for binary + else: + for treatment_var in self.d_cols: + this_d = self.data.loc[:, treatment_var] + binary_treat = type_of_target(this_d) == "binary" + zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) + is_binary[treatment_var] = binary_treat & zero_one_treat + return is_binary + + def _check_binary_outcome(self): + y = self.data.loc[:, self.y_col] + binary_outcome = type_of_target(y) == "binary" + zero_one_outcome = np.all((np.power(y, 2) - y) == 0) + is_binary = binary_outcome & zero_one_outcome + return is_binary + + @staticmethod + def _check_disjoint(set1, set2, name1, arg1, name2, arg2): + """Helper method to check for disjoint sets.""" + if not set1.isdisjoint(set2): + raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).") + def _check_disjoint_sets(self): # this function can be extended in inherited subclasses self._check_disjoint_sets_y_d_x() diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index b528ead8..c7909b4e 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -1,9 +1,12 @@ import io import pandas as pd from sklearn.utils.validation import check_array +from sklearn.utils import assert_all_finite from doubleml.data.base_data import DoubleMLData from doubleml.utils._estimation import _assure_2d_array +from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d +from sklearn.utils.multiclass import type_of_target class DoubleMLDIDData(DoubleMLData): @@ -80,10 +83,12 @@ def __init__( use_other_treat_as_covariate=True, force_all_x_finite=True, force_all_d_finite=True, - ): - # Initialize _t_col to None first to avoid AttributeError during parent init + ): # Initialize _t_col to None first to avoid AttributeError during parent init self._t_col = None - + + # Store whether x_cols was originally None to reset it later + x_cols_was_none = x_cols is None + # Call parent constructor first to set _data super().__init__( data=data, @@ -97,8 +102,25 @@ def __init__( force_all_d_finite=force_all_d_finite, ) - # Set time column after parent constructor (which sets _data) - self.t_col = t_col + # Set time column directly to avoid triggering checks during init + if t_col is not None: + if not isinstance(t_col, str): + raise TypeError( + "The time variable t_col must be of str type (or None). " + f"{str(t_col)} of type {str(type(t_col))} was passed." + ) + if t_col not in self.all_variables: + raise ValueError(f"Invalid time variable t_col. 
{t_col} is no data column.") + self._t_col = t_col + + # If x_cols was originally None, reset it to exclude the time column + if x_cols_was_none and t_col is not None: + self.x_cols = None + + # Now run the checks and set variables + if t_col is not None: + self._check_disjoint_sets() + self._set_y_z_t() # Set time variable array after data is loaded self._set_time_var() @@ -109,8 +131,8 @@ def from_arrays( x, y, d, - t, z=None, + t=None, cluster_vars=None, use_other_treat_as_covariate=True, force_all_x_finite=True, @@ -165,11 +187,13 @@ def from_arrays( >>> obj_dml_data_from_array = DoubleMLDIDData.from_arrays(x, y, d, t=t) """ # Prepare time variable - t = check_array(t, ensure_2d=False, allow_nd=False) - t = _assure_2d_array(t) - if t.shape[1] != 1: - raise ValueError("t must be a single column.") - t_col = "t" + + if t is None: + t_col = None + else: + t = column_or_1d(t, warn=True) + check_consistent_length(x, y, d, t) + t_col = "t" # Create base data using parent class method base_data = DoubleMLData.from_arrays( @@ -179,13 +203,16 @@ def from_arrays( # Add time variable to the DataFrame data = pd.concat((base_data.data, pd.DataFrame(t, columns=[t_col])), axis=1) + if t is not None: + data[t_col] = t + return cls( data, base_data.y_col, base_data.d_cols, - t_col, base_data.x_cols, base_data.z_cols, + t_col, base_data.cluster_cols, base_data.use_other_treat_as_covariate, base_data.force_all_x_finite, @@ -199,30 +226,6 @@ def t_col(self): """ return self._t_col - @t_col.setter - def t_col(self, value): - if not isinstance(value, str): - raise TypeError( - "The time variable t_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." - ) - # Check if data exists (during initialization it might not) - if hasattr(self, "_data") and value not in self.all_variables: - raise ValueError("Invalid time variable t_col. The time variable is no data column.") - self._t_col = value - # Update time variable array if data is already loaded - if hasattr(self, "_data"): - self._set_time_var() - - @property - def t(self): - """ - Array of time variable. - """ - if self.t_col is not None: - return self._t.values - else: - return None - @t_col.setter def t_col(self, value): reset_value = hasattr(self, "_t_col") @@ -239,8 +242,18 @@ def t_col(self, value): self._t_col = None if reset_value: self._check_disjoint_sets() - self._set_y_z_t_s() + self._set_y_z_t() + + @property + def t(self): + """ + Array of time variable. 
+ """ + if self.t_col is not None: + return self._t.values + else: + return None def _get_optional_col_sets(self): """Get optional column sets including time column.""" @@ -254,7 +267,8 @@ def _check_disjoint_sets(self): """Check that time column doesn't overlap with other variables.""" # Apply standard checks from parent class super()._check_disjoint_sets() - self._check_disjoint_sets_t_col() + if self.t_col is not None: + self._check_disjoint_sets_t_col() def _check_disjoint_sets_t_col(self): """Check that time column is disjoint from other variable sets.""" @@ -287,6 +301,17 @@ def _set_time_var(self): if hasattr(self, "_data") and self.t_col in self.data.columns: self._t = self.data.loc[:, [self.t_col]] + def _set_y_z_t(self): + def _set_attr(col): + if col is None: + return None + assert_all_finite(self.data.loc[:, col]) + return self.data.loc[:, col] + + self._y = _set_attr(self.y_col) + self._z = _set_attr(self.z_cols) + self._t = _set_attr(self.t_col) + def __str__(self): """String representation.""" data_summary = self._data_summary_str() diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py index 301a4234..425fbde5 100644 --- a/doubleml/data/ssm_data.py +++ b/doubleml/data/ssm_data.py @@ -1,6 +1,7 @@ import io import pandas as pd from sklearn.utils.validation import check_array +from sklearn.utils import assert_all_finite from doubleml.data.base_data import DoubleMLData from doubleml.utils._estimation import _assure_2d_array @@ -66,8 +67,7 @@ class DoubleMLSSMData(DoubleMLData): >>> df = make_ssm_data(return_type='DataFrame') >>> obj_dml_data_from_df = DoubleMLSSMData(df, 'y', 'd', 's') >>> # initialization from np.ndarray - >>> (x, y, d, s) = make_ssm_data(return_type='array') - >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) + >>> (x, y, d, s) = make_ssm_data(return_type='array') >>> obj_dml_data_from_array = DoubleMLSSMData.from_arrays(x, y, d, s=s) """ def __init__( @@ -83,8 +83,11 @@ def __init__( force_all_x_finite=True, force_all_d_finite=True, ): - # Set selection column before calling parent constructor - self.s_col = s_col + # Initialize _s_col to None first to avoid AttributeError during parent init + self._s_col = None + + # Store whether x_cols was originally None to reset it later + x_cols_was_none = x_cols is None # Call parent constructor super().__init__( @@ -99,6 +102,26 @@ def __init__( force_all_d_finite=force_all_d_finite, ) + # Set selection column directly to avoid triggering checks during init + if s_col is not None: + if not isinstance(s_col, str): + raise TypeError( + "The selection variable s_col must be of str type (or None). " + f"{str(s_col)} of type {str(type(s_col))} was passed." + ) + if s_col not in self.all_variables: + raise ValueError(f"Invalid selection variable s_col. {s_col} is no data column.") + self._s_col = s_col + + # If x_cols was originally None, reset it to exclude the selection column + if x_cols_was_none and s_col is not None: + self.x_cols = None + + # Now run the checks and set variables + if s_col is not None: + self._check_disjoint_sets() + self._set_y_z_s() + # Set selection variable array after data is loaded self._set_selection_var() @@ -193,6 +216,14 @@ def from_arrays( ) @property + def s(self): + """ + Array of score or selection variable. + """ + if self.s_col is not None: + return self._s.values + else: + return None @property def s_col(self): """ The selection variable. 
@@ -201,30 +232,29 @@ def s_col(self): @s_col.setter def s_col(self, value): - if not isinstance(value, str): - raise TypeError( - "The selection variable s_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed." - ) - # Check if data exists (during initialization it might not) - if hasattr(self, "_data") and value not in self.all_variables: - raise ValueError("Invalid selection variable s_col. The selection variable is no data column.") - self._s_col = value - # Update selection variable array if data is already loaded - if hasattr(self, "_data"): - self._set_selection_var() - - @property - def s(self): - """ - Array of selection variable. - """ - return self._s.values + reset_value = hasattr(self, "_s_col") + if value is not None: + if not isinstance(value, str): + raise TypeError( + "The selection variable s_col must be of str type (or None). " + f"{str(value)} of type {str(type(value))} was passed." + ) + if value not in self.all_variables: + raise ValueError(f"Invalid selection variable s_col. {value} is no data column.") + self._s_col = value + else: + self._s_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_s() def _get_optional_col_sets(self): """Get optional column sets including selection column.""" base_optional_col_sets = super()._get_optional_col_sets() - s_col_set = {self.s_col} - return [s_col_set] + base_optional_col_sets + if self.s_col is not None: + s_col_set = {self.s_col} + return [s_col_set] + base_optional_col_sets + return base_optional_col_sets def _check_disjoint_sets(self): """Check that selection column doesn't overlap with other variables.""" @@ -263,6 +293,17 @@ def _set_selection_var(self): if hasattr(self, "_data") and self.s_col in self.data.columns: self._s = self.data.loc[:, [self.s_col]] + def _set_y_z_s(self): + def _set_attr(col): + if col is None: + return None + assert_all_finite(self.data.loc[:, col]) + return self.data.loc[:, col] + + self._y = _set_attr(self.y_col) + self._z = _set_attr(self.z_cols) + self._s = _set_attr(self.s_col) + def __str__(self): """String representation.""" data_summary = self._data_summary_str() diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py index 7dc94fd4..a84e4e77 100644 --- a/doubleml/data/tests/test_dml_data.py +++ b/doubleml/data/tests/test_dml_data.py @@ -67,7 +67,7 @@ def test_obj_vs_from_arrays(): dml_data_from_array = DoubleMLData.from_arrays( dml_data.data[dml_data.x_cols], dml_data.data[dml_data.y_col], dml_data.data[dml_data.d_cols] ) - assert dml_data_from_array.data.equals(dml_data.data) + assert np.array_equal(dml_data_from_array.data, dml_data.data) dml_data = _make_pliv_data(n_obs=100) dml_data_from_array = DoubleMLData.from_arrays( @@ -76,7 +76,7 @@ def test_obj_vs_from_arrays(): dml_data.data[dml_data.d_cols], dml_data.data[dml_data.z_cols], ) - assert dml_data_from_array.data.equals(dml_data.data) + assert np.array_equal(dml_data_from_array.data, dml_data.data) dml_data = make_pliv_CHS2015(n_obs=100, dim_z=5) dml_data_from_array = DoubleMLData.from_arrays( @@ -191,32 +191,22 @@ def test_x_cols_setter_defaults(): # without instrument with time df = pd.DataFrame(np.tile(np.arange(5), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "tt"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", t_col="tt") + dml_data = DoubleMLDIDData(df, y_col="yy", d_cols="dd", t_col="tt") assert dml_data.x_cols == ["xx1", "xx2"] # with instrument with time df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd", "xx1", 
"xx2", "zz", "tt"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", z_cols="zz", t_col="tt") + dml_data = DoubleMLDIDData(df, y_col="yy", d_cols="dd", z_cols="zz", t_col="tt") assert dml_data.x_cols == ["xx1", "xx2"] # without instrument with selection df = pd.DataFrame(np.tile(np.arange(5), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "ss"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", s_col="ss") + dml_data = DoubleMLSSMData(df, y_col="yy", d_cols="dd", s_col="ss") assert dml_data.x_cols == ["xx1", "xx2"] # with instrument with selection df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", z_cols="zz", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] - - # with selection and time - df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", t_col="tt", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] - - # with instrument, selection and time - df = pd.DataFrame(np.tile(np.arange(7), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", z_cols="zz", t_col="tt", s_col="ss") + dml_data = DoubleMLSSMData(df, y_col="yy", d_cols="dd", z_cols="zz", s_col="ss") assert dml_data.x_cols == ["xx1", "xx2"] @@ -322,7 +312,7 @@ def test_t_col_setter(): np.random.seed(3141) df = make_did_SZ2020(n_obs=100, cross_sectional_data=True, return_type=pd.DataFrame) df["t_new"] = np.ones(shape=(100,)) - dml_data = DoubleMLDIDData(df, "y", "d", z_cols=[f"Z{i + 1}" for i in np.arange(4)], t_col="t") + dml_data = DoubleMLDIDData(df, "y", "d", x_cols=[f"Z{i + 1}" for i in np.arange(4)], t_col="t") # check that after changing t_col, the t array gets updated t_comp = dml_data.data["t_new"].values @@ -460,42 +450,38 @@ def test_disjoint_sets(): # time variable msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") + _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="dd1") + _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="dd1") msg = r"At least one variable/column is set as covariate \(``x_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") + _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") msg = r"At least one variable/column is set as instrumental variable \(``z_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", t_col="zz") + _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", t_col="zz") # score or selection variable msg = ( - r"At least one variable/column is set as outcome variable \(``y_col``\) and score or selection variable \(``s_col``\)." 
+ r"At least one variable/column is set as outcome variable \(``y_col``\) and selection variable \(``s_col``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="yy") + _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="yy") msg = ( r"At least one variable/column is set as treatment variable \(``d_cols``\) " - r"and score or selection variable \(``s_col``\)." + r"and selection variable \(``s_col``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") - msg = r"At least one variable/column is set as covariate \(``x_cols``\) and score or selection variable \(``s_col``\)." + _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") + msg = r"At least one variable/column is set as covariate \(``x_cols``\) and selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") + _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") msg = ( r"At least one variable/column is set as instrumental variable \(``z_cols``\) " - r"and score or selection variable \(``s_col``\)." + r"and selection variable \(``s_col``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") - msg = r"At least one variable/column is set as time variable \(``t_col``\) and score or selection variable \(``s_col``\)." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", s_col="tt") - + _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") @pytest.mark.ci def test_duplicates(): diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index 0c0f31ab..db82b032 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -12,6 +12,7 @@ _dml_panel_data_alias = _get_dml_panel_data_alias() + def _generate_features(n_obs, c, dim_x=4): cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=n_obs) @@ -182,16 +183,13 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _array_alias: return z, y, d, None - elif return_type in _data_frame_alias + _dml_did_data_alias + _dml_panel_data_alias: + elif return_type in _data_frame_alias + _dml_did_data_alias: z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"]) if return_type in _data_frame_alias: return data - elif return_type in _dml_did_data_alias: - return DoubleMLDIDData(data, "y", "d", z_cols=z_cols, t_col="t") else: - assert return_type in _dml_panel_data_alias - return DoubleMLPanelData(data, "y", "d", t_col="t", id_col="id", x_cols=z_cols) + return DoubleMLDIDData(data, "y", "d", x_cols=z_cols) elif return_type == "DoubleMLPanelData": z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] df0 = ( @@ -231,15 +229,12 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _array_alias: return z, y, d, t - elif return_type in _data_frame_alias + _dml_did_data_alias + _dml_panel_data_alias: + elif return_type in _data_frame_alias + 
_dml_did_data_alias: z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"]) if return_type in _data_frame_alias: return data elif return_type in _dml_did_data_alias: - return DoubleMLDIDData(data, "y", "d", z_cols=z_cols, t_col="t") - else: - assert return_type in _dml_panel_data_alias - return DoubleMLPanelData(data, "y", "d", t_col="t", id_col="id", x_cols=z_cols) + return DoubleMLDIDData(data, "y", "d", x_cols=z_cols, t_col="t") else: - raise ValueError("Invalid return_type.") + raise ValueError("Invalid return_type.") \ No newline at end of file From 9f6f5d432a9c259344cab0dcc80fc19a7af5ac35 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 5 Jun 2025 16:10:57 +0200 Subject: [PATCH 26/84] add _n_obs_sample_splitting property to doubleml class --- doubleml/double_ml.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index c2d3727b..58b8692a 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -98,6 +98,7 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): # perform sample splitting self._smpls = None self._smpls_cluster = None + self._n_obs_sample_splitting = self.n_obs if draw_sample_splitting: self.draw_sample_splitting() @@ -1200,37 +1201,30 @@ def evaluate_learners(self, learners=None, metric=_rmse): f"The learners have to be a subset of {str(self.params_names)}. Learners {str(learners)} provided." ) - def draw_sample_splitting(self, n_obs=None): + def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. The samples are drawn according to the attributes ``n_folds`` and ``n_rep``. - Parameters - ---------- - n_obs : int or None - The number of observations to resample. If ``None``, the number of observations is set to the number - of observations in the data set. - Returns ------- self : object """ - if n_obs is None: - n_obs = self.n_obs - if self._is_cluster_data: obj_dml_resampling = DoubleMLClusterResampling( n_folds=self._n_folds_per_cluster, n_rep=self.n_rep, - n_obs=n_obs, + n_obs=self._n_obs_sample_splitting, n_cluster_vars=self._dml_data.n_cluster_vars, cluster_vars=self._dml_data.cluster_vars, ) self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() else: - obj_dml_resampling = DoubleMLResampling(n_folds=self.n_folds, n_rep=self.n_rep, n_obs=n_obs, stratify=self._strata) + obj_dml_resampling = DoubleMLResampling( + n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata + ) self._smpls = obj_dml_resampling.split_samples() return self From b96a8392392e21387243698ffe5505205d667fda Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 5 Jun 2025 16:22:34 +0200 Subject: [PATCH 27/84] some progress on refactoring the data backends. 
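For orientation, a minimal usage sketch of the refactored backends (a sketch only, assuming the keyword-based constructor arguments and import paths shown in the surrounding diffs; the DataFrame and column names are illustrative):

import numpy as np
import pandas as pd
from doubleml.data import DoubleMLDIDData, DoubleMLSSMData

df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss"])

# the time column t_col of the DID backend is optional and passed by keyword
did_data = DoubleMLDIDData(df, y_col="yy", d_cols="dd", x_cols=["xx1", "xx2"], t_col="tt")

# the selection column s_col of the SSM backend is likewise optional (defaults to None)
ssm_data = DoubleMLSSMData(df, y_col="yy", d_cols="dd", x_cols=["xx1", "xx2"], s_col="ss")

print(did_data.t_col, ssm_data.s_col)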
--- doubleml/data/ssm_data.py | 11 +++++++---- doubleml/data/tests/test_dml_data.py | 4 ++-- doubleml/irm/datasets/dgp_ssm_data.py | 14 +++++++------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/doubleml/data/ssm_data.py b/doubleml/data/ssm_data.py index 425fbde5..91c50bb0 100644 --- a/doubleml/data/ssm_data.py +++ b/doubleml/data/ssm_data.py @@ -75,9 +75,9 @@ def __init__( data, y_col, d_cols, - s_col, x_cols=None, z_cols=None, + s_col=None, cluster_cols=None, use_other_treat_as_covariate=True, force_all_x_finite=True, @@ -131,8 +131,8 @@ def from_arrays( x, y, d, - s, z=None, + s=None, cluster_vars=None, use_other_treat_as_covariate=True, force_all_x_finite=True, @@ -206,9 +206,9 @@ def from_arrays( data, base_data.y_col, base_data.d_cols, - s_col, base_data.x_cols, base_data.z_cols, + s_col, base_data.cluster_cols, base_data.use_other_treat_as_covariate, base_data.force_all_x_finite, @@ -223,7 +223,10 @@ def s(self): if self.s_col is not None: return self._s.values else: - return None @property + return None + + + @property def s_col(self): """ The selection variable. diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py index a84e4e77..af09e89e 100644 --- a/doubleml/data/tests/test_dml_data.py +++ b/doubleml/data/tests/test_dml_data.py @@ -344,11 +344,11 @@ def test_s_col_setter(): dml_data.s_col = "s_new" assert np.array_equal(dml_data.s, s_comp) - msg = r"Invalid score or selection variable s_col. a13 is no data column." + msg = r"Invalid selection variable s_col. a13 is no data column." with pytest.raises(ValueError, match=msg): dml_data.s_col = "a13" - msg = r"The score or selection variable s_col must be of str type \(or None\). " "5 of type was passed." + msg = r"The selection variable s_col must be of str type \(or None\). " "5 of type was passed." with pytest.raises(TypeError, match=msg): dml_data.s_col = 5 diff --git a/doubleml/irm/datasets/dgp_ssm_data.py b/doubleml/irm/datasets/dgp_ssm_data.py index 6a6a5bee..51a33c3a 100644 --- a/doubleml/irm/datasets/dgp_ssm_data.py +++ b/doubleml/irm/datasets/dgp_ssm_data.py @@ -2,15 +2,15 @@ import pandas as pd from scipy.linalg import toeplitz -from doubleml.data import DoubleMLData -from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias +from doubleml.data import DoubleMLSSMData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_ssm_data_alias _array_alias = _get_array_alias() _data_frame_alias = _get_data_frame_alias() -_dml_data_alias = _get_dml_data_alias() +_dml_ssm_data_alias = _get_dml_ssm_data_alias() -def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLData"): +def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLSSMData"): """ Generates data from a sample selection model (SSM). 
The data generating process is defined as @@ -86,7 +86,7 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleM if return_type in _array_alias: return x, y, d, z, s - elif return_type in _data_frame_alias + _dml_data_alias: + elif return_type in _data_frame_alias + _dml_ssm_data_alias: x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] if mar: data = pd.DataFrame(np.column_stack((x, y, d, s)), columns=x_cols + ["y", "d", "s"]) @@ -96,7 +96,7 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleM return data else: if mar: - return DoubleMLData(data, "y", "d", x_cols, None, None, "s") - return DoubleMLData(data, "y", "d", x_cols, "z", None, "s") + return DoubleMLSSMData(data, "y", "d", x_cols, z_cols=None, s_col="s") + return DoubleMLSSMData(data, "y", "d", x_cols, z_cols="z", s_col="s") else: raise ValueError("Invalid return_type.") From eb951c40ba6c6ed58854a1f6bede79411e2c7efb Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 5 Jun 2025 16:49:51 +0200 Subject: [PATCH 28/84] update check_resampling input --- doubleml/double_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 58b8692a..d2a7a641 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1290,7 +1290,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): >>> dml_plr_obj.set_sample_splitting(smpls) """ self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self.n_obs + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting ) # set sample splitting can update the number of repetitions From a6c6507fabe396ac084c1d9825b2fdf6a7850e33 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 5 Jun 2025 16:50:01 +0200 Subject: [PATCH 29/84] update did binary classes with n_obs_subset and n_obs_sample_splitting --- doubleml/did/did_binary.py | 12 ++++++------ doubleml/did/did_cs_binary.py | 19 +++++-------------- .../tests/test_did_binary_control_groups.py | 2 +- .../test_did_binary_external_predictions.py | 2 +- .../did/tests/test_did_binary_vs_did_panel.py | 2 +- 5 files changed, 14 insertions(+), 23 deletions(-) diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index e4d309db..a4876f74 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -171,8 +171,7 @@ def __init__( # Numeric values for positions of the entries in id_panel_data inside id_original # np.nonzero(np.isin(id_original, id_panel_data)) - self._n_subset = self._panel_data_wide.shape[0] - self._n_obs = self._n_subset # Effective sample size used for resampling + self._n_obs_subset = self._panel_data_wide.shape[0] # Effective sample size used for resampling self._n_treated_subset = self._panel_data_wide["G_indicator"].sum() # Save x and y for later ML estimation @@ -192,6 +191,7 @@ def __init__( # set stratication for resampling self._strata = self._panel_data_wide["G_indicator"] + self._n_obs_sample_splitting = self.n_obs_subset if draw_sample_splitting: self.draw_sample_splitting() @@ -244,7 +244,7 @@ def __str__(self): f"Evaluation period: {str(self.t_value_eval)}\n" f"Control group: {str(self.control_group)}\n" f"Anticipation periods: {str(self.anticipation_periods)}\n" - f"Effective sample size: {str(self.n_obs)}\n" + f"Effective sample size: {str(self.n_obs_subset)}\n" ) learner_info = "" for key, value in 
self.learner.items(): @@ -371,11 +371,11 @@ def trimming_threshold(self): return self._trimming_threshold @property - def n_obs(self): + def n_obs_subset(self): """ The number of observations used for estimation. """ - return self._n_subset + return self._n_obs_subset def _initialize_ml_nuisance_params(self): if self.score == "observational": @@ -713,7 +713,7 @@ def _sensitivity_element_est(self, preds): } # add scaling to make variance estimation consistent (sample size difference) - scaling = self._dml_data.n_obs / self._n_subset + scaling = self._dml_data.n_obs / self._n_obs_subset element_dict = { "sigma2": sigma2, "nu2": nu2, diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index e550eb60..d571e107 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -53,16 +53,7 @@ def __init__( self._n_obs = obj_dml_data.data.shape[0] self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat) # reinitialze arrays - ( - self._psi, - self._psi_deriv, - self._psi_elements, - self._var_scaling_factors, - self._coef, - self._se, - self._all_coef, - self._all_se, - ) = self._initialize_arrays() + self._initialize_arrays() self._check_data(self._dml_data) g_values = self._dml_data.g_values @@ -108,8 +99,7 @@ def __init__( # Numeric values for positions of the entries in id_panel_data inside id_original # np.nonzero(np.isin(id_original, id_panel_data)) - self._n_subset = self.data_subset.shape[0] - self._n_obs = self._n_subset # Effective sample size used for resampling + self._n_obs_subset = self.data_subset.shape[0] # Effective sample size used for resampling # Save x and y for later ML estimation self._x_data = self.data_subset.loc[:, self._dml_data.x_cols].values @@ -129,6 +119,7 @@ def __init__( # set stratication for resampling self._strata = self.data_subset["G_indicator"] + 2 * self.data_subset["t_indicator"] + self._n_obs_sample_splitting = self.n_obs_subset if draw_sample_splitting: self.draw_sample_splitting() @@ -255,11 +246,11 @@ def trimming_threshold(self): return self._trimming_threshold @property - def n_obs(self): + def n_obs_subset(self): """ The number of observations used for estimation. 
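To illustrate the distinction this patch introduces (a sketch only, assuming the DoubleMLPanelData API shown in the diffs below): n_obs now counts the rows of the long-format data, while the new n_ids property counts unique units.

from doubleml.data import DoubleMLPanelData
from doubleml.did.datasets import make_did_CS2021

df = make_did_CS2021(n_obs=100, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float")
dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"])

# rows of the long-format data (units x periods) vs. number of unique unit ids
print(dml_data.n_obs, dml_data.n_ids)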
""" - return self._n_subset + return self._n_obs_subset def _initialize_ml_nuisance_params(self): if self.score == "observational": diff --git a/doubleml/did/tests/test_did_binary_control_groups.py b/doubleml/did/tests/test_did_binary_control_groups.py index b8406b15..627cf50a 100644 --- a/doubleml/did/tests/test_did_binary_control_groups.py +++ b/doubleml/did/tests/test_did_binary_control_groups.py @@ -21,7 +21,7 @@ def test_control_groups_different(): dml_did_never_treated = dml.did.DoubleMLDIDBinary(control_group="never_treated", **args) dml_did_not_yet_treated = dml.did.DoubleMLDIDBinary(control_group="not_yet_treated", **args) - assert dml_did_never_treated._n_subset != dml_did_not_yet_treated._n_subset + assert dml_did_never_treated.n_obs_subset != dml_did_not_yet_treated.n_obs_subset # same treatment group assert dml_did_never_treated._n_treated_subset == dml_did_not_yet_treated._n_treated_subset diff --git a/doubleml/did/tests/test_did_binary_external_predictions.py b/doubleml/did/tests/test_did_binary_external_predictions.py index ccc136d0..0cb3e055 100644 --- a/doubleml/did/tests/test_did_binary_external_predictions.py +++ b/doubleml/did/tests/test_did_binary_external_predictions.py @@ -112,7 +112,7 @@ def doubleml_did_panel_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs=dml_did._n_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 1eacdf6a..7d1dc947 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -178,7 +178,7 @@ def test_sensitivity_elements(dml_did_binary_vs_did_fixture): ) for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] - scaling = dml_binary_obj._n_subset / dml_binary_obj._dml_data.n_obs + scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_obs binary_sensitivity_element = scaling * _get_id_positions( dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions ) From d54b272235261a090792990b180c1b74b4e861da Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 09:19:43 +0200 Subject: [PATCH 30/84] update tune without folds to n_obs of doubleml obj --- doubleml/double_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index d2a7a641..88f677ef 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -848,7 +848,7 @@ def tune( self.set_ml_nuisance_params(nuisance_model, self._dml_data.d_cols[i_d], params) else: - smpls = [(np.arange(self._dml_data.n_obs), np.arange(self._dml_data.n_obs))] + smpls = [(np.arange(self.n_obs), np.arange(self.n_obs))] # tune hyperparameters res = self._nuisance_tuning( smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search From 693e109bd65d6cb0987c9de2363266cf48c61d32 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 09:20:31 +0200 Subject: [PATCH 31/84] change n_obs for panel data add n_ids for did_binary obj --- doubleml/data/panel_data.py | 4 +-- 
doubleml/data/tests/test_panel_data.py | 5 ++-- doubleml/did/did_binary.py | 12 ++++++-- doubleml/did/did_cs_binary.py | 5 ---- .../did/tests/test_did_binary_vs_did_panel.py | 2 +- .../test_did_multi_external_predictions.py | 7 +++++ .../did/tests/test_did_multi_return_types.py | 5 ++-- doubleml/did/tests/test_return_types.py | 13 ++++++-- doubleml/utils/_check_return_types.py | 30 ++++++------------- 9 files changed, 45 insertions(+), 38 deletions(-) diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index 4e416183..59ad531c 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -215,9 +215,9 @@ def id_var_unique(self): return self._id_var_unique @property - def n_obs(self): + def n_ids(self): """ - The number of observations. For panel data, the number of unique values for id_col. + The number of unique values for id_col. """ return len(self._id_var_unique) diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py index 2f2250ba..e1a7c925 100644 --- a/doubleml/data/tests/test_panel_data.py +++ b/doubleml/data/tests/test_panel_data.py @@ -56,7 +56,7 @@ def test_id_col_setter(): dml_data.id_col = "id_new" assert np.array_equal(dml_data.id_var, id_comp) assert dml_data._id_var_unique == np.unique(id_comp) - assert dml_data.n_obs == 1 + assert dml_data.n_ids == 1 msg = "Invalid id variable id_col. a13 is no data column." with pytest.raises(ValueError, match=msg): @@ -169,7 +169,8 @@ def test_panel_data_properties(): assert np.array_equal(dml_data.id_var, df["id"].values) assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values)) - assert dml_data.n_obs == len(np.unique(df["id"].values)) + assert dml_data.n_obs == df.shape[0] + assert dml_data.n_ids == len(np.unique(df["id"].values)) assert dml_data.g_col == "d" assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values))) assert dml_data.n_groups == len(np.unique(df["d"].values)) diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index a4876f74..a9939c97 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -124,6 +124,12 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) self._check_data(self._dml_data) + # for did panel data the scores are based on the number of unique ids + self._n_obs = obj_dml_data.n_ids + self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat) + # reinitialze arrays + self._initialize_arrays() + g_values = self._dml_data.g_values t_values = self._dml_data.t_values @@ -542,7 +548,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) extend_kwargs = { - "n_obs": self._dml_data.n_obs, + "n_obs": self._dml_data.n_ids, "id_positions": self.id_positions, } psi_elements = { @@ -707,13 +713,13 @@ def _sensitivity_element_est(self, preds): psi_nu2 = nu2_score_element - nu2 extend_kwargs = { - "n_obs": self._dml_data.n_obs, + "n_obs": self._dml_data.n_ids, "id_positions": self.id_positions, "fill_value": 0.0, } # add scaling to make variance estimation consistent (sample size difference) - scaling = self._dml_data.n_obs / self._n_obs_subset + scaling = self._dml_data.n_ids / self._n_obs_subset element_dict = { "sigma2": sigma2, "nu2": nu2, diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index d571e107..e1786242 100644 --- a/doubleml/did/did_cs_binary.py +++ 
b/doubleml/did/did_cs_binary.py @@ -50,11 +50,6 @@ def __init__( ): super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) - self._n_obs = obj_dml_data.data.shape[0] - self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat) - # reinitialze arrays - self._initialize_arrays() - self._check_data(self._dml_data) g_values = self._dml_data.g_values t_values = self._dml_data.t_values diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 7d1dc947..9da81739 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -178,7 +178,7 @@ def test_sensitivity_elements(dml_did_binary_vs_did_fixture): ) for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] - scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_obs + scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_ids binary_sensitivity_element = scaling * _get_id_positions( dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions ) diff --git a/doubleml/did/tests/test_did_multi_external_predictions.py b/doubleml/did/tests/test_did_multi_external_predictions.py index 2e7003f9..e336487d 100644 --- a/doubleml/did/tests/test_did_multi_external_predictions.py +++ b/doubleml/did/tests/test_did_multi_external_predictions.py @@ -100,3 +100,10 @@ def test_coef(doubleml_did_multi_ext_fixture): assert math.isclose( doubleml_did_multi_ext_fixture["coef"], doubleml_did_multi_ext_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 ) + + +@pytest.mark.ci +def test_se(doubleml_did_multi_ext_fixture): + assert math.isclose( + doubleml_did_multi_ext_fixture["se"], doubleml_did_multi_ext_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) diff --git a/doubleml/did/tests/test_did_multi_return_types.py b/doubleml/did/tests/test_did_multi_return_types.py index 2e12ce10..c11544ed 100644 --- a/doubleml/did/tests/test_did_multi_return_types.py +++ b/doubleml/did/tests/test_did_multi_return_types.py @@ -17,6 +17,7 @@ N_REP = 1 N_FOLDS = 3 N_REP_BOOT = 314 +N_PERIODS = 5 dml_args = { "n_rep": N_REP, @@ -30,7 +31,7 @@ datasets = {} # panel data -df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) datasets["did_panel"] = DoubleMLPanelData( df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] @@ -89,7 +90,7 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): assert dml_obj.n_gt_atts == n_treat assert dml_obj.n_rep == N_REP assert dml_obj.n_folds == N_FOLDS - assert dml_obj._dml_data.n_obs == N_OBS + assert dml_obj._dml_data.n_obs == N_OBS * N_PERIODS assert dml_obj.n_rep_boot == N_REP_BOOT assert isinstance(dml_obj.all_coef, np.ndarray) diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index a59cec6c..1b6fa736 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -79,7 +79,8 @@ def test_sensitivity_return_types(fitted_dml_obj): # panel data -df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +N_PERIODS = 5 +df_panel = 
make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) datasets["did_panel"] = DoubleMLPanelData( df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] @@ -160,7 +161,15 @@ def fitted_panel_dml_obj(request): @pytest.mark.ci def test_panel_property_types_and_shapes(fitted_panel_dml_obj): - check_basic_property_types_and_shapes(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_property_types_and_shapes( + fitted_panel_dml_obj, + n_obs=N_PERIODS * N_OBS, + n_treat=N_TREAT, + n_rep=N_REP, + n_folds=N_FOLDS, + n_rep_boot=N_REP_BOOT, + score_dim=(N_OBS, N_REP, N_TREAT), + ) check_basic_predictions_and_targets(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP) diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index 54462059..54e72833 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -31,10 +31,14 @@ def check_basic_return_types(dml_obj, cls): assert isinstance(dml_obj._dml_data.__str__(), str) -def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot): +def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot, score_dim=None): # not checked: learner, learner_names, params, params_names, score # already checked: summary + # use default combination + if score_dim is None: + score_dim = (n_obs, n_rep, n_treat) + # check that the setting is still in line with the hard-coded values assert dml_obj._dml_data.n_treat == n_treat assert dml_obj.n_rep == n_rep @@ -55,35 +59,19 @@ def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_fold assert dml_obj.coef.shape == (n_treat,) assert isinstance(dml_obj.psi, np.ndarray) - assert dml_obj.psi.shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi.shape == score_dim is_nonlinear = isinstance(dml_obj, NonLinearScoreMixin) if is_nonlinear: for score_element in dml_obj._score_element_names: assert isinstance(dml_obj.psi_elements[score_element], np.ndarray) - assert dml_obj.psi_elements[score_element].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements[score_element].shape == score_dim else: assert isinstance(dml_obj.psi_elements["psi_a"], np.ndarray) - assert dml_obj.psi_elements["psi_a"].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements["psi_a"].shape == score_dim assert isinstance(dml_obj.psi_elements["psi_b"], np.ndarray) - assert dml_obj.psi_elements["psi_b"].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements["psi_b"].shape == score_dim assert isinstance(dml_obj.framework, DoubleMLFramework) assert isinstance(dml_obj.pval, np.ndarray) From 16624d5677cbf95dad815d6536b54e547bd0db05 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Fri, 6 Jun 2025 11:10:57 +0200 Subject: [PATCH 32/84] fix docstr --- doubleml/data/did_data.py | 3 ++- doubleml/data/rdd_data.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index c7909b4e..fd4fc7de 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -181,7 +181,8 @@ def from_arrays( Default is ``True``. 
Examples - -------- >>> from doubleml import DoubleMLDIDData + -------- + >>> from doubleml import DoubleMLDIDData >>> from doubleml.did.datasets import make_did_SZ2020 >>> (x, y, d, t) = make_did_SZ2020(return_type='array') >>> obj_dml_data_from_array = DoubleMLDIDData.from_arrays(x, y, d, t=t) diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py index ac0fff67..f19a4fa0 100644 --- a/doubleml/data/rdd_data.py +++ b/doubleml/data/rdd_data.py @@ -59,7 +59,8 @@ class DoubleMLRDDData(DoubleMLData): Default is ``True``. Examples - -------- >>> from doubleml import DoubleMLRDDData + -------- + >>> from doubleml import DoubleMLRDDData >>> from doubleml.rdd.datasets import make_rdd_data >>> # initialization from pandas.DataFrame >>> df = make_rdd_data(return_type='DataFrame') From 7d6ef350f5116241a84017e49ae5e9dd59f56895 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 12:18:04 +0200 Subject: [PATCH 33/84] fix order test --- doubleml/did/did_cs_binary.py | 2 +- ...test_did_cs_binary_vs_did_cs_two_period.py | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index e1786242..5d6e3638 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -462,7 +462,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa ) extend_kwargs = { - "n_obs": self._dml_data.data.shape[0], + "n_obs": self._dml_data.n_obs, "id_positions": self.id_positions, } psi_elements = { diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py index 2c8c34f3..a0a25718 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -51,10 +51,14 @@ def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score # collect data dml_panel_data = generate_data_did_binary df = dml_panel_data._data.sort_values(by=["id", "t"]) + # Reorder data before to make both approaches compatible + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) n_obs = df.shape[0] all_smpls = draw_smpls(n_obs, n_folds) - obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) # Set machine learning methods for m & g ml_g = clone(learner[0]) @@ -161,3 +165,44 @@ def test_coefs(dml_did_cs_binary_vs_did_cs_fixture): rel_tol=1e-9, abs_tol=1e-4, ) + + +@pytest.mark.ci +def test_ses(dml_did_cs_binary_vs_did_cs_fixture): + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["se"][0], + dml_did_cs_binary_vs_did_cs_fixture["se_manual"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["se_binary"][0], + dml_did_cs_binary_vs_did_cs_fixture["se"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_boot(dml_did_cs_binary_vs_did_cs_fixture): + for bootstrap in dml_did_cs_binary_vs_did_cs_fixture["boot_methods"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap], + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap + "_manual"], + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap], + 
dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap + "_binary"], + atol=1e-4, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_cs_binary_vs_did_cs_fixture): + assert ( + dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss"].keys() + == dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) From 18c38445220a723dc6935fa3c9f788aea4e54e48 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 12:30:57 +0200 Subject: [PATCH 34/84] add sensitivity estimation to did_cs_binary --- doubleml/did/did_cs_binary.py | 101 +++++++++++++++++- ...test_did_cs_binary_vs_did_cs_two_period.py | 76 ++++++++++++- 2 files changed, 174 insertions(+), 3 deletions(-) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 5d6e3638..479cba93 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -153,7 +153,7 @@ def __init__( self._trimming_threshold = trimming_threshold _check_trimming(self._trimming_rule, self._trimming_threshold) - self._sensitivity_implemented = False + self._sensitivity_implemented = True self._external_predictions_implemented = True @property @@ -589,4 +589,101 @@ def _nuisance_tuning( pass def _sensitivity_element_est(self, preds): - pass + y = self._y_data + d = self._g_data + t = self._t_data + + m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) + g_hat_d0_t0 = _get_id_positions(preds["predictions"]["ml_g_d0_t0"], self.id_positions) + g_hat_d0_t1 = _get_id_positions(preds["predictions"]["ml_g_d0_t1"], self.id_positions) + g_hat_d1_t0 = _get_id_positions(preds["predictions"]["ml_g_d1_t0"], self.id_positions) + g_hat_d1_t1 = _get_id_positions(preds["predictions"]["ml_g_d1_t1"], self.id_positions) + + d0t0 = np.multiply(1.0 - d, 1.0 - t) + d0t1 = np.multiply(1.0 - d, t) + d1t0 = np.multiply(d, 1.0 - t) + d1t1 = np.multiply(d, t) + + g_hat = ( + np.multiply(d0t0, g_hat_d0_t0) + + np.multiply(d0t1, g_hat_d0_t1) + + np.multiply(d1t0, g_hat_d1_t0) + + np.multiply(d1t1, g_hat_d1_t1) + ) + sigma2_score_element = np.square(y - g_hat) + sigma2 = np.mean(sigma2_score_element) + psi_sigma2 = sigma2_score_element - sigma2 + + # calc m(W,alpha) and Riesz representer + p_hat = np.mean(d) + lambda_hat = np.mean(t) + if self.score == "observational": + propensity_weight_d0 = np.divide(m_hat, 1.0 - m_hat) + if self.in_sample_normalization: + weight_d0t1 = np.multiply(d0t1, propensity_weight_d0) + weight_d0t0 = np.multiply(d0t0, propensity_weight_d0) + mean_weight_d0t1 = np.mean(weight_d0t1) + mean_weight_d0t0 = np.mean(weight_d0t0) + + m_alpha = np.multiply( + np.divide(d, p_hat), + np.divide(1.0, np.mean(d1t1)) + + np.divide(1.0, np.mean(d1t0)) + + np.divide(propensity_weight_d0, mean_weight_d0t1) + + np.divide(propensity_weight_d0, mean_weight_d0t0), + ) + + rr = ( + np.divide(d1t1, np.mean(d1t1)) + - np.divide(d1t0, np.mean(d1t0)) + - np.divide(weight_d0t1, mean_weight_d0t1) + + np.divide(weight_d0t0, mean_weight_d0t0) + ) + else: + m_alpha_1 = np.divide(1.0, lambda_hat) + np.divide(1.0, 1.0 - lambda_hat) + m_alpha = np.multiply(np.divide(d, np.square(p_hat)), np.multiply(m_alpha_1, 1.0 + propensity_weight_d0)) + + rr_1 = np.divide(t, np.multiply(p_hat, lambda_hat)) + np.divide(1.0 - t, np.multiply(p_hat, 1.0 - lambda_hat)) + rr_2 = d + np.multiply(1.0 - d, propensity_weight_d0) + rr = 
np.multiply(rr_1, rr_2) + else: + assert self.score == "experimental" + if self.in_sample_normalization: + m_alpha = ( + np.divide(1.0, np.mean(d1t1)) + + np.divide(1.0, np.mean(d1t0)) + + np.divide(1.0, np.mean(d0t1)) + + np.divide(1.0, np.mean(d0t0)) + ) + rr = ( + np.divide(d1t1, np.mean(d1t1)) + - np.divide(d1t0, np.mean(d1t0)) + - np.divide(d0t1, np.mean(d0t1)) + + np.divide(d0t0, np.mean(d0t0)) + ) + else: + m_alpha = ( + np.divide(1.0, np.multiply(p_hat, lambda_hat)) + + np.divide(1.0, np.multiply(p_hat, 1.0 - lambda_hat)) + + np.divide(1.0, np.multiply(1.0 - p_hat, lambda_hat)) + + np.divide(1.0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + ) + rr = ( + np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + - np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + - np.divide(d0t1, np.multiply(1.0 - p_hat, lambda_hat)) + + np.divide(d0t0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + ) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2 = np.mean(nu2_score_element) + psi_nu2 = nu2_score_element - nu2 + + element_dict = { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": psi_sigma2, + "psi_nu2": psi_nu2, + "riesz_rep": rr, + } + return element_dict diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py index a0a25718..73e6b827 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -9,7 +9,7 @@ import doubleml as dml from ...tests._utils import draw_smpls -from ._utils_did_cs_manual import fit_did_cs +from ._utils_did_cs_manual import fit_did_cs, fit_sensitivity_elements_did_cs from ._utils_did_manual import boot_did @@ -148,6 +148,30 @@ def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + res_dict["sensitivity_elements_manual"] = fit_sensitivity_elements_did_cs( + y, + d, + t, + all_coef=dml_did_obj.all_coef, + predictions=dml_did_obj.predictions, + score=score, + in_sample_normalization=in_sample_normalization, + n_rep=1, + ) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + return res_dict @@ -206,3 +230,53 @@ def test_nuisance_loss(dml_did_cs_binary_vs_did_cs_fixture): ) for key, value in dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss"].items(): assert np.allclose(value, dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_cs_binary_vs_did_cs_fixture): + sensitivity_element_names = ["sigma2", "nu2", "psi_sigma2", "psi_nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_manual"][sensitivity_element], + 
rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["riesz_rep"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_cs_binary_vs_did_cs_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key]["lower"], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key]["upper"], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) From 5d2232b455bede3866f0c2b7626f682f4cbad14d Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:00:20 +0200 Subject: [PATCH 35/84] fix id positions and scaling for sensitivity --- doubleml/did/did_cs_binary.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 479cba93..6b2206a3 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -679,11 +679,19 @@ def _sensitivity_element_est(self, preds): nu2 = np.mean(nu2_score_element) psi_nu2 = nu2_score_element - nu2 + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + "fill_value": 0.0, + } + + # add scaling to make variance estimation consistent (sample size difference) + scaling = self._dml_data.n_obs / self._n_obs_subset element_dict = { "sigma2": sigma2, "nu2": nu2, - "psi_sigma2": psi_sigma2, - "psi_nu2": psi_nu2, - "riesz_rep": rr, + "psi_sigma2": scaling * _set_id_positions(psi_sigma2, **extend_kwargs), + "psi_nu2": scaling * _set_id_positions(psi_nu2, **extend_kwargs), + "riesz_rep": scaling * _set_id_positions(rr, **extend_kwargs), } return element_dict From 7f01b6b5accc1293fba5435ae81129cca4b5f630 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:00:33 +0200 Subject: [PATCH 36/84] add placebo test for did_cs_binary --- .../did/tests/test_did_cs_binary_placebo.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 doubleml/did/tests/test_did_cs_binary_placebo.py diff --git a/doubleml/did/tests/test_did_cs_binary_placebo.py b/doubleml/did/tests/test_did_cs_binary_placebo.py new file mode 100644 index 00000000..61def691 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_placebo.py @@ -0,0 +1,58 @@ +import numpy as np +import pytest +from lightgbm import LGBMClassifier, LGBMRegressor + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + 
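# Note on the fixture below: with n_pre_treat_periods=3, both t_value_pre (t_values[0])
# and t_value_eval (t_values[1]) fall before any unit is treated, so the evaluated
# contrast is a placebo with a true effect of zero; test_zero checks that the 99%
# confidence interval covers zero.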
+ +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + dgp = 5 # has to be experimental (for experimental score to be valid) + df = make_did_CS2021(n_obs=n_obs, dgp=dgp, n_pre_treat_periods=3) + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "g_value": dml_data.g_values[0], + "t_value_pre": dml_data.t_values[0], + "t_value_eval": dml_data.t_values[1], + "ml_g": LGBMRegressor(verbose=-1), + "ml_m": LGBMClassifier(verbose=-1), + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDCSBinary(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.99) + + res_dict = { + "coef": dml_did.coef[0], + "ci_lower": ci.iloc[0, 0], + "ci_upper": ci.iloc[0, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert doubleml_did_fixture["ci_lower"] <= 0.0 + assert doubleml_did_fixture["ci_upper"] >= 0.0 From 3fafccc2cddb36397880ea9dfed993efe8c8d0ad Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:08:45 +0200 Subject: [PATCH 37/84] extend ext prediction tests for did_cs_binary --- ...test_did_cs_binary_external_predictions.py | 81 ++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/doubleml/did/tests/test_did_cs_binary_external_predictions.py b/doubleml/did/tests/test_did_cs_binary_external_predictions.py index 4e09dfe0..477c6dc7 100644 --- a/doubleml/did/tests/test_did_cs_binary_external_predictions.py +++ b/doubleml/did/tests/test_did_cs_binary_external_predictions.py @@ -4,8 +4,9 @@ import pytest from sklearn.linear_model import LinearRegression, LogisticRegression +from doubleml.data import DoubleMLPanelData from doubleml.did import DoubleMLDIDCSBinary -from doubleml.did.datasets import make_did_SZ2020 +from doubleml.did.datasets import make_did_cs_CS2021, make_did_SZ2020 from doubleml.tests._utils import draw_smpls from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor @@ -90,3 +91,81 @@ def test_score(doubleml_did_cs_fixture): def test_nuisance_loss(doubleml_did_cs_fixture): for key, value in doubleml_did_cs_fixture["dml_did_nuisance_loss"].items(): assert np.allclose(value, doubleml_did_cs_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) + + +@pytest.fixture(scope="module") +def doubleml_did_cs_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDCSBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + dml_did_ext = 
DoubleMLDIDCSBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_cs_panel_fixture): + assert math.isclose( + doubleml_did_cs_panel_fixture["coef"], doubleml_did_cs_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_cs_panel_fixture): + assert math.isclose( + doubleml_did_cs_panel_fixture["se"], doubleml_did_cs_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_cs_panel_fixture): + assert np.allclose( + doubleml_did_cs_panel_fixture["score"], doubleml_did_cs_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_cs_panel_fixture): + for key, value in doubleml_did_cs_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_cs_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) From 9e378518109c15529782025e646825f071790d1c Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:11:44 +0200 Subject: [PATCH 38/84] add control group test for did_cs_binary --- .../test_did_cs_binary_control_groups.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 doubleml/did/tests/test_did_cs_binary_control_groups.py diff --git a/doubleml/did/tests/test_did_cs_binary_control_groups.py b/doubleml/did/tests/test_did_cs_binary_control_groups.py new file mode 100644 index 00000000..ea4f2933 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_control_groups.py @@ -0,0 +1,31 @@ +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_cs_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=4, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + +args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": "observational", + "n_rep": 1, +} + + +def test_control_groups_different(): + dml_did_never_treated = dml.did.DoubleMLDIDCSBinary(control_group="never_treated", **args) + dml_did_not_yet_treated = dml.did.DoubleMLDIDCSBinary(control_group="not_yet_treated", **args) + + assert dml_did_never_treated.n_obs_subset != dml_did_not_yet_treated.n_obs_subset + # same treatment group + assert dml_did_never_treated.data_subset["G_indicator"].sum() == dml_did_not_yet_treated.data_subset["G_indicator"].sum() + + dml_did_never_treated.fit() + dml_did_not_yet_treated.fit() + + assert dml_did_never_treated.coef != dml_did_not_yet_treated.coef From 810eade37a837a3f8c3f0dbae8728c51fc8fea79 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:40:50 +0200 Subject: [PATCH 39/84] add tune to did_cs_binary --- doubleml/did/did_cs_binary.py | 115 ++++++++- doubleml/did/tests/test_did_cs_binary_tune.py | 221 ++++++++++++++++++ 2 files changed, 334 insertions(+), 2 
deletions(-) create mode 100644 doubleml/did/tests/test_did_cs_binary_tune.py diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 6b2206a3..161a31c3 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -23,7 +23,7 @@ _check_score, _check_trimming, ) -from doubleml.utils._estimation import _dml_cv_predict, _get_cond_smpls_2d +from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d from doubleml.utils._propensity_score import _trimm @@ -586,7 +586,118 @@ def _score_elements(self, y, d, t, g_hat_d0_t0, g_hat_d0_t1, g_hat_d1_t0, g_hat_ def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): - pass + x, y = check_X_y(X=self._x_data, y=self._y_data, force_all_finite=False) + _, d = check_X_y(x, self._g_data, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data, force_all_finite=False) + + if scoring_methods is None: + scoring_methods = {"ml_g": None, "ml_m": None} + + # nuisance training sets conditional on d and t + smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) + train_inds = [train_index for (train_index, _) in smpls] + train_inds_d0_t0 = [train_index for (train_index, _) in smpls_d0_t0] + train_inds_d0_t1 = [train_index for (train_index, _) in smpls_d0_t1] + train_inds_d1_t0 = [train_index for (train_index, _) in smpls_d1_t0] + train_inds_d1_t1 = [train_index for (train_index, _) in smpls_d1_t1] + + tune_args = { + "n_folds_tune": n_folds_tune, + "n_jobs_cv": n_jobs_cv, + "search_mode": search_mode, + "n_iter_randomized_search": n_iter_randomized_search, + } + + g_d0_t0_tune_res = _dml_tune( + y, + x, + train_inds_d0_t0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d0_t1_tune_res = _dml_tune( + y, + x, + train_inds_d0_t1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d1_t0_tune_res = _dml_tune( + y, + x, + train_inds_d1_t0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d1_t1_tune_res = _dml_tune( + y, + x, + train_inds_d1_t1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + m_tune_res = list() + if self.score == "observational": + m_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + **tune_args, + ) + + g_d0_t0_best_params = [xx.best_params_ for xx in g_d0_t0_tune_res] + g_d0_t1_best_params = [xx.best_params_ for xx in g_d0_t1_tune_res] + g_d1_t0_best_params = [xx.best_params_ for xx in g_d1_t0_tune_res] + g_d1_t1_best_params = [xx.best_params_ for xx in g_d1_t1_tune_res] + + if self.score == "observational": + m_best_params = [xx.best_params_ for xx in m_tune_res] + params = { + "ml_g_d0_t0": g_d0_t0_best_params, + "ml_g_d0_t1": g_d0_t1_best_params, + "ml_g_d1_t0": g_d1_t0_best_params, + "ml_g_d1_t1": g_d1_t1_best_params, + "ml_m": m_best_params, + } + tune_res = { + "g_d0_t0_tune": g_d0_t0_tune_res, + "g_d0_t1_tune": g_d0_t1_tune_res, + "g_d1_t0_tune": g_d1_t0_tune_res, + "g_d1_t1_tune": g_d1_t1_tune_res, + "m_tune": m_tune_res, + } + else: + params = { + "ml_g_d0_t0": g_d0_t0_best_params, + "ml_g_d0_t1": g_d0_t1_best_params, + "ml_g_d1_t0": g_d1_t0_best_params, + "ml_g_d1_t1": g_d1_t1_best_params, + } + tune_res = { + "g_d0_t0_tune": g_d0_t0_tune_res, + "g_d0_t1_tune": 
g_d0_t1_tune_res, + "g_d1_t0_tune": g_d1_t0_tune_res, + "g_d1_t1_tune": g_d1_t1_tune_res, + } + + res = {"params": params, "tune_res": tune_res} + + return res def _sensitivity_element_est(self, preds): y = self._y_data diff --git a/doubleml/did/tests/test_did_cs_binary_tune.py b/doubleml/did/tests/test_did_cs_binary_tune.py new file mode 100644 index 00000000..0bd2c6ab --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_tune.py @@ -0,0 +1,221 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_cs_manual import fit_did_cs, tune_nuisance_did_cs +from ._utils_did_manual import boot_did + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression()]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-4, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sample_normalization, tune_on_folds): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + # Reorder data before to make both approaches compatible + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + n_obs = df.shape[0] + strata = df["d"] + 2 * df["t"] # only valid since it values are binary + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=strata) + + # Set machine learning methods for m & g + ml_g = clone(learner_g) + ml_m = clone(learner_m) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDIDCS( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + # tune hyperparameters + np.random.seed(3141) + tune_res = dml_did_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLDIDCS) + np.random.seed(3141) + tune_res_binary = dml_did_binary_obj.tune( + par_grid, tune_on_folds=tune_on_folds, 
n_folds_tune=n_folds_tune, return_tune_res=False
+    )
+    assert isinstance(tune_res_binary, dml.did.DoubleMLDIDCSBinary)
+
+    dml_did_obj.fit()
+    dml_did_binary_obj.fit()
+
+    # manual fit
+    y = df["y"].values
+    d = df["d"].values
+    x = df[["Z1", "Z2", "Z3", "Z4"]].values
+    t = df["t"].values
+    np.random.seed(3141)
+    smpls = all_smpls[0]
+
+    if tune_on_folds:
+        g_d0_t0_params, g_d0_t1_params, g_d1_t0_params, g_d1_t1_params, m_params = tune_nuisance_did_cs(
+            y, x, d, t, clone(learner_g), clone(learner_m), smpls, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"]
+        )
+    else:
+        xx = [(np.arange(len(y)), np.array([]))]
+        g_d0_t0_params, g_d0_t1_params, g_d1_t0_params, g_d1_t1_params, m_params = tune_nuisance_did_cs(
+            y, x, d, t, clone(learner_g), clone(learner_m), xx, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"]
+        )
+        g_d0_t0_params = g_d0_t0_params * n_folds
+        g_d0_t1_params = g_d0_t1_params * n_folds
+        g_d1_t0_params = g_d1_t0_params * n_folds
+        g_d1_t1_params = g_d1_t1_params * n_folds
+        if score == "observational":
+            m_params = m_params * n_folds
+        else:
+            assert score == "experimental"
+            m_params = None
+
+    res_manual = fit_did_cs(
+        y,
+        x,
+        d,
+        t,
+        clone(learner_g),
+        clone(learner_m),
+        all_smpls,
+        score,
+        in_sample_normalization,
+        g_d0_t0_params=g_d0_t0_params,
+        g_d0_t1_params=g_d0_t1_params,
+        g_d1_t0_params=g_d1_t0_params,
+        g_d1_t1_params=g_d1_t1_params,
+        m_params=m_params,
+    )
+
+    res_dict = {
+        "coef": dml_did_obj.coef,
+        "coef_binary": dml_did_binary_obj.coef,
+        "coef_manual": res_manual["theta"],
+        "se": dml_did_obj.se,
+        "se_binary": dml_did_binary_obj.se,
+        "se_manual": res_manual["se"],
+        "boot_methods": boot_methods,
+    }
+
+    for bootstrap in boot_methods:
+        np.random.seed(3141)
+        boot_t_stat = boot_did(
+            y,
+            res_manual["thetas"],
+            res_manual["ses"],
+            res_manual["all_psi_a"],
+            res_manual["all_psi_b"],
+            all_smpls,
+            bootstrap,
+            n_rep_boot,
+        )
+
+        np.random.seed(3141)
+        dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
+        np.random.seed(3141)
+        dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
+
+        res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat
+        res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat
+        res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1)
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_did_coef(dml_did_fixture):
+    assert math.isclose(dml_did_fixture["coef"][0], dml_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_did_fixture["coef_binary"][0], dml_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4)
+
+
+@pytest.mark.ci
+def test_dml_did_se(dml_did_fixture):
+    assert math.isclose(dml_did_fixture["se"][0], dml_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(dml_did_fixture["se_binary"][0], dml_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4)
+
+
+@pytest.mark.ci
+def test_boot(dml_did_fixture):
+    for bootstrap in dml_did_fixture["boot_methods"]:
+        assert np.allclose(
+            dml_did_fixture["boot_t_stat" + bootstrap],
+            dml_did_fixture["boot_t_stat" + bootstrap + "_manual"],
+            rtol=1e-9,
+            atol=1e-4,
+        )
+
+        assert np.allclose(
+            dml_did_fixture["boot_t_stat" + bootstrap],
+            dml_did_fixture["boot_t_stat" + bootstrap + "_binary"],
+            rtol=1e-9,
+            atol=1e-4,
+        )

From 6b6116cb608414a3c9313447d89c51a0c04c3651 Mon Sep 17 00:00:00 2001
From: SvenKlaassen
Date: Fri, 6 Jun 2025 13:53:03 +0200
Subject: [PATCH 40/84] update did_cs_binary stdout test

---
.../did/tests/test_did_cs_binary_stdout.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 doubleml/did/tests/test_did_cs_binary_stdout.py diff --git a/doubleml/did/tests/test_did_cs_binary_stdout.py b/doubleml/did/tests/test_did_cs_binary_stdout.py new file mode 100644 index 00000000..16135636 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_stdout.py @@ -0,0 +1,49 @@ +import io +from contextlib import redirect_stdout + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + + +@pytest.mark.ci +def test_print_periods(): + """Test that print_periods parameter correctly controls output printing.""" + + # Create test data + dml_data = dml.did.datasets.make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData") + + # Test 1: Default case (print_periods=False) - should not print anything + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=False, # Default + ) + output_default = f.getvalue() + assert output_default.strip() == "", "Expected no output with print_periods=False" + + # Test 2: With print_periods=True - should print information + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=True, + ) + output_print = f.getvalue() + assert "Evaluation of ATT(1, 1), with pre-treatment period 0" in output_print + assert "post-treatment: True" in output_print + assert "Control group: never_treated" in output_print From de324cfe102bc466e571f298e0be2f499911b0f0 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 13:59:54 +0200 Subject: [PATCH 41/84] add exceptions and tests --- doubleml/did/did_cs_binary.py | 28 ++++ .../tests/test_did_cs_binary_exceptions.py | 152 ++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 doubleml/did/tests/test_did_cs_binary_exceptions.py diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 161a31c3..a34dbf2a 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -806,3 +806,31 @@ def _sensitivity_element_est(self, preds): "riesz_rep": scaling * _set_id_positions(rr, **extend_kwargs), } return element_dict + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. 
" + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/tests/test_did_cs_binary_exceptions.py b/doubleml/did/tests/test_did_cs_binary_exceptions.py new file mode 100644 index 00000000..b506da2d --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_exceptions.py @@ -0,0 +1,152 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + +valid_arguments = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": "observational", + "n_rep": 1, + "draw_sample_splitting": True, +} + + +@pytest.mark.ci +def test_input(): + # control group + msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"control_group": 0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # g value + msg = r"The value test is not in the set of treatment group values \[0 1\]." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + msg = r"The never treated group is not allowed as treatment group \(g_value=0\)." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": 0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + msg = r"The never treated group is not allowed as treatment group \(g_value=0\)." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": 0.0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # t values + msg = r"The value test is not in the set of evaluation period values \[0 1\]." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"t_value_pre": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"t_value_eval": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # in-sample normalization + msg = "in_sample_normalization indicator has to be boolean. Object of type passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # ml_g classifier + msg = r"The ml_g learner LogisticRegression\(\) was identified as" + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_no_control_group_exception(): + msg = "No observations in the control group." 
+ with pytest.raises(ValueError, match=msg): + invalid_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + invalid_data.data["d"] = 1.0 + invalid_arguments = {"obj_dml_data": invalid_data, "control_group": "not_yet_treated"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_check_data_exceptions(): + """Test exception handling for _check_data method in DoubleMLDIDCSBinary""" + df = pd.DataFrame(np.random.normal(size=(10, 5)), columns=[f"Col_{i}" for i in range(5)]) + + # Test 1: Data has to be DoubleMLPanelData + invalid_data_types = [ + dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + ] + + for invalid_data in invalid_data_types: + msg = r"For repeated outcomes the data must be of DoubleMLPanelData type\." + with pytest.raises(TypeError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=invalid_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 2: Data cannot have instrumental variables + df_with_z = dml_data.data.copy() + dml_data_with_z = dml.data.DoubleMLPanelData( + df_with_z, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z1"], x_cols=["Z2", "Z3", "Z4"] + ) + + msg = r"Incompatible data. Z1 have been set as instrumental variable\(s\)." + with pytest.raises(NotImplementedError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data_with_z, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 3: Data must have exactly one treatment variable (using mock) + with patch.object(dml_data.__class__, "n_treat", property(lambda self: 2)): + msg = ( + "Incompatible data. To fit an DID model with DML exactly one variable needs to be specified as treatment variable." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + +@pytest.mark.ci +def test_benchmark_warning(): + """Test warning when sensitivity_benchmark is called with experimental score""" + args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "n_rep": 1, + } + # Create a DID model with experimental score + did_model = dml.did.DoubleMLDIDCSBinary(**args, score="experimental") + did_model.fit() + with pytest.warns(UserWarning, match="Sensitivity benchmarking for experimental score may not be meaningful"): + did_model.sensitivity_benchmark(["Z1", "Z2"]) From 8d0c52c54405a089e9171a5380185fc7ebff272a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Jun 2025 15:02:54 +0200 Subject: [PATCH 42/84] simplify did_cs_binary nuisance estimation --- doubleml/did/did_cs_binary.py | 131 +++++++++++----------------------- 1 file changed, 40 insertions(+), 91 deletions(-) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index a34dbf2a..fafcecf4 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -318,6 +318,34 @@ def _preprocess_data(self, g_value, pre_t, eval_t): data_subset = data_subset.assign(t_indicator=data_subset[t_col] == eval_t) return data_subset + def _estimate_conditional_g( + self, x, y, d_val, t_val, d_arr, t_arr, smpls_cond, external_prediction, learner_param_key, n_jobs_cv, return_models + ): + """Helper function to estimate conditional g_hat for fixed d and t.""" + g_hat_cond = {} + condition = (d_arr == d_val) & (t_arr == t_val) + + if external_prediction is not None: + ml_g_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_targets[condition] = y[condition] + ml_pred = _get_id_positions(external_prediction, self.id_positions) + g_hat_cond = {"preds": ml_pred, "targets": ml_g_targets, "models": None} + else: + g_hat_cond = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_cond, + n_jobs=n_jobs_cv, + est_params=self._get_params(learner_param_key), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + _check_finite_predictions(g_hat_cond["preds"], self._learner["ml_g"], "ml_g", smpls_cond) + g_hat_cond["targets"] = g_hat_cond["targets"].astype(float) + g_hat_cond["targets"][~condition] = np.nan + return g_hat_cond + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # Here: d is a binary treatment indicator @@ -333,97 +361,18 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # nuisance g smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) - # nuisance g for d==0 & t==0 - if external_predictions["ml_g_d0_t0"] is not None: - ml_g_d0_t0_targets = np.full_like(y, np.nan, dtype="float64") - ml_g_d0_t0_targets[((d == 0) & (t == 0))] = y[((d == 0) & (t == 0))] - ml_d0_t0_pred = _get_id_positions(external_predictions["ml_g_d0_t0"], self.id_positions) - g_hat_d0_t0 = {"preds": ml_d0_t0_pred, "targets": ml_g_d0_t0_targets, "models": None} - else: - g_hat_d0_t0 = _dml_cv_predict( - self._learner["ml_g"], - x, - y, - smpls_d0_t0, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_g_d0_t0"), - method=self._predict_method["ml_g"], - return_models=return_models, - ) - - _check_finite_predictions(g_hat_d0_t0["preds"], self._learner["ml_g"], 
"ml_g", smpls) - # adjust target values to consider only compatible subsamples - g_hat_d0_t0["targets"] = g_hat_d0_t0["targets"].astype(float) - g_hat_d0_t0["targets"][np.invert((d == 0) & (t == 0))] = np.nan - - # nuisance g for d==0 & t==1 - if external_predictions["ml_g_d0_t1"] is not None: - ml_g_d0_t1_targets = np.full_like(y, np.nan, dtype="float64") - ml_g_d0_t1_targets[((d == 0) & (t == 1))] = y[((d == 0) & (t == 1))] - ml_d0_t1_pred = _get_id_positions(external_predictions["ml_g_d0_t1"], self.id_positions) - g_hat_d0_t1 = {"preds": ml_d0_t1_pred, "targets": ml_g_d0_t1_targets, "models": None} - else: - g_hat_d0_t1 = _dml_cv_predict( - self._learner["ml_g"], - x, - y, - smpls_d0_t1, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_g_d0_t1"), - method=self._predict_method["ml_g"], - return_models=return_models, - ) - - _check_finite_predictions(g_hat_d0_t1["preds"], self._learner["ml_g"], "ml_g", smpls) - # adjust target values to consider only compatible subsamples - g_hat_d0_t1["targets"] = g_hat_d0_t1["targets"].astype(float) - g_hat_d0_t1["targets"][np.invert((d == 0) & (t == 1))] = np.nan - - # nuisance g for d==1 & t==0 - if external_predictions["ml_g_d1_t0"] is not None: - ml_g_d1_t0_targets = np.full_like(y, np.nan, dtype="float64") - ml_g_d1_t0_targets[((d == 1) & (t == 0))] = y[((d == 1) & (t == 0))] - ml_d1_t0_pred = _get_id_positions(external_predictions["ml_g_d1_t0"], self.id_positions) - g_hat_d1_t0 = {"preds": ml_d1_t0_pred, "targets": ml_g_d1_t0_targets, "models": None} - else: - g_hat_d1_t0 = _dml_cv_predict( - self._learner["ml_g"], - x, - y, - smpls_d1_t0, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_g_d1_t0"), - method=self._predict_method["ml_g"], - return_models=return_models, - ) - - _check_finite_predictions(g_hat_d1_t0["preds"], self._learner["ml_g"], "ml_g", smpls) - # adjust target values to consider only compatible subsamples - g_hat_d1_t0["targets"] = g_hat_d1_t0["targets"].astype(float) - g_hat_d1_t0["targets"][np.invert((d == 1) & (t == 0))] = np.nan - - # nuisance g for d==1 & t==1 - if external_predictions["ml_g_d1_t1"] is not None: - ml_g_d1_t1_targets = np.full_like(y, np.nan, dtype="float64") - ml_g_d1_t1_targets[((d == 1) & (t == 1))] = y[((d == 1) & (t == 1))] - ml_d1_t1_pred = _get_id_positions(external_predictions["ml_g_d1_t1"], self.id_positions) - g_hat_d1_t1 = {"preds": ml_d1_t1_pred, "targets": ml_g_d1_t1_targets, "models": None} - else: - g_hat_d1_t1 = _dml_cv_predict( - self._learner["ml_g"], - x, - y, - smpls_d1_t1, - n_jobs=n_jobs_cv, - est_params=self._get_params("ml_g_d1_t1"), - method=self._predict_method["ml_g"], - return_models=return_models, - ) - - _check_finite_predictions(g_hat_d1_t1["preds"], self._learner["ml_g"], "ml_g", smpls) - # adjust target values to consider only compatible subsamples - g_hat_d1_t1["targets"] = g_hat_d1_t1["targets"].astype(float) - g_hat_d1_t1["targets"][np.invert((d == 1) & (t == 1))] = np.nan + g_hat_d0_t0 = self._estimate_conditional_g( + x, y, 0, 0, d, t, smpls_d0_t0, external_predictions["ml_g_d0_t0"], "ml_g_d0_t0", n_jobs_cv, return_models + ) + g_hat_d0_t1 = self._estimate_conditional_g( + x, y, 0, 1, d, t, smpls_d0_t1, external_predictions["ml_g_d0_t1"], "ml_g_d0_t1", n_jobs_cv, return_models + ) + g_hat_d1_t0 = self._estimate_conditional_g( + x, y, 1, 0, d, t, smpls_d1_t0, external_predictions["ml_g_d1_t0"], "ml_g_d1_t0", n_jobs_cv, return_models + ) + g_hat_d1_t1 = self._estimate_conditional_g( + x, y, 1, 1, d, t, smpls_d1_t1, external_predictions["ml_g_d1_t1"], "ml_g_d1_t1", 
n_jobs_cv, return_models + ) # only relevant for observational setting m_hat = {"preds": None, "targets": None, "models": None} From af45f7fb8b7dee34a480b7843054163a37d47efc Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 08:34:48 +0200 Subject: [PATCH 43/84] add __str__ method to did_cs_binary --- doubleml/did/did_cs_binary.py | 53 +++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index fafcecf4..7788f4b3 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -156,6 +156,59 @@ def __init__( self._sensitivity_implemented = True self._external_predictions_implemented = True + def __str__(self): + class_name = self.__class__.__name__ + header = f"================== {class_name} Object ==================\n" + data_summary = self._dml_data._data_summary_str() + score_info = ( + f"Score function: {str(self.score)}\n" + f"Treatment group: {str(self.g_value)}\n" + f"Pre-treatment period: {str(self.t_value_pre)}\n" + f"Evaluation period: {str(self.t_value_eval)}\n" + f"Control group: {str(self.control_group)}\n" + f"Anticipation periods: {str(self.anticipation_periods)}\n" + f"Effective sample size: {str(self.n_obs_subset)}\n" + ) + learner_info = "" + for key, value in self.learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" + if self.nuisance_loss is not None: + learner_info += "Out-of-sample Performance:\n" + is_classifier = [value for value in self._is_classifier.values()] + is_regressor = [not value for value in is_classifier] + if any(is_regressor): + learner_info += "Regression:\n" + for learner in [key for key, value in self._is_classifier.items() if value is False]: + learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" + if any(is_classifier): + learner_info += "Classification:\n" + for learner in [key for key, value in self._is_classifier.items() if value is True]: + learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + + if self._is_cluster_data: + resampling_info = ( + f"No. folds per cluster: {self._n_folds_per_cluster}\n" + f"No. folds: {self.n_folds}\n" + f"No. repeated sample splits: {self.n_rep}\n" + ) + else: + resampling_info = f"No. folds: {self.n_folds}\nNo. 
repeated sample splits: {self.n_rep}\n" + fit_summary = str(self.summary) + res = ( + header + + "\n------------------ Data summary ------------------\n" + + data_summary + + "\n------------------ Score & algorithm ------------------\n" + + score_info + + "\n------------------ Machine learner ------------------\n" + + learner_info + + "\n------------------ Resampling ------------------\n" + + resampling_info + + "\n------------------ Fit summary ------------------\n" + + fit_summary + ) + return res + @property def g_value(self): """ From 698f161945dadb75a4d2637e311c811c9542338a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 09:31:54 +0200 Subject: [PATCH 44/84] add test on panel data to did_cs binary --- doubleml/did/did_cs.py | 18 +- .../test_did_cs_binary_vs_did_cs_panel.py | 202 ++++++++++++++++++ 2 files changed, 215 insertions(+), 5 deletions(-) create mode 100644 doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index 5984399c..8136f60c 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -227,7 +227,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # nuisance g smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) if external_predictions["ml_g_d0_t0"] is not None: - g_hat_d0_t0 = {"preds": external_predictions["ml_g_d0_t0"], "targets": None, "models": None} + g_hat_d0_t0_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d0_t0_targets[(d == 0) & (t == 0)] = y[(d == 0) & (t == 0)] + g_hat_d0_t0 = {"preds": external_predictions["ml_g_d0_t0"], "targets": g_hat_d0_t0_targets, "models": None} else: g_hat_d0_t0 = _dml_cv_predict( self._learner["ml_g"], @@ -243,7 +245,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d0_t0["targets"] = g_hat_d0_t0["targets"].astype(float) g_hat_d0_t0["targets"][np.invert((d == 0) & (t == 0))] = np.nan if external_predictions["ml_g_d0_t1"] is not None: - g_hat_d0_t1 = {"preds": external_predictions["ml_g_d0_t1"], "targets": None, "models": None} + g_hat_d0_t1_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d0_t1_targets[(d == 0) & (t == 1)] = y[(d == 0) & (t == 1)] + g_hat_d0_t1 = {"preds": external_predictions["ml_g_d0_t1"], "targets": g_hat_d0_t1_targets, "models": None} else: g_hat_d0_t1 = _dml_cv_predict( self._learner["ml_g"], @@ -258,7 +262,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d0_t1["targets"] = g_hat_d0_t1["targets"].astype(float) g_hat_d0_t1["targets"][np.invert((d == 0) & (t == 1))] = np.nan if external_predictions["ml_g_d1_t0"] is not None: - g_hat_d1_t0 = {"preds": external_predictions["ml_g_d1_t0"], "targets": None, "models": None} + g_hat_d1_t0_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d1_t0_targets[(d == 1) & (t == 0)] = y[(d == 1) & (t == 0)] + g_hat_d1_t0 = {"preds": external_predictions["ml_g_d1_t0"], "targets": g_hat_d1_t0_targets, "models": None} else: g_hat_d1_t0 = _dml_cv_predict( self._learner["ml_g"], @@ -273,7 +279,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d1_t0["targets"] = g_hat_d1_t0["targets"].astype(float) g_hat_d1_t0["targets"][np.invert((d == 1) & (t == 0))] = np.nan if external_predictions["ml_g_d1_t1"] is not None: - g_hat_d1_t1 = {"preds": external_predictions["ml_g_d1_t1"], "targets": None, "models": None} + g_hat_d1_t1_targets = np.full_like(y, np.nan, dtype="float64") + 
g_hat_d1_t1_targets[(d == 1) & (t == 1)] = y[(d == 1) & (t == 1)] + g_hat_d1_t1 = {"preds": external_predictions["ml_g_d1_t1"], "targets": g_hat_d1_t1_targets, "models": None} else: g_hat_d1_t1 = _dml_cv_predict( self._learner["ml_g"], @@ -293,7 +301,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if self.score == "observational": # nuisance m if external_predictions["ml_m"] is not None: - m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + m_hat = {"preds": external_predictions["ml_m"], "targets": d, "models": None} else: m_hat = _dml_cv_predict( self._learner["ml_m"], diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py new file mode 100644 index 00000000..8fab2615 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py @@ -0,0 +1,202 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._did_utils import _get_id_positions + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "ml_g": clone(learner[0]), + "ml_m": clone(learner[1]), + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=dml_panel_data.g_values[0], + t_value_pre=dml_panel_data.t_values[0], + t_value_eval=dml_panel_data.t_values[1], + **dml_args, + ) + dml_did_binary_obj.fit() + + df_subset = dml_did_binary_obj.data_subset.copy() + dml_data = dml.data.DoubleMLData( + df_subset, y_col="y", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"], t_col="t_indicator" + ) + dml_did_obj = dml.DoubleMLDIDCS( + dml_data, + **dml_args, + ) + + # use external predictions (sample splitting is hard to synchronize) + ext_predictions = {"G_indicator": {}} + ext_predictions["G_indicator"]["ml_g_d0_t0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d0_t0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d0_t1"] = 
_get_id_positions( + dml_did_binary_obj.predictions["ml_g_d0_t1"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d1_t0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d1_t0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d1_t1"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d1_t1"][:, :, 0], dml_did_binary_obj._id_positions + ) + if score == "observational": + ext_predictions["G_indicator"]["ml_m"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_m"][:, :, 0], dml_did_binary_obj._id_positions + ) + dml_did_obj.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "dml_did_binary_obj": dml_did_binary_obj, + } + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +# No Boostrap Tests as the observations are not ordered in the same way + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] + scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_obs + binary_sensitivity_element = scaling * _get_id_positions( + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + binary_sensitivity_element, + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + 
rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) From 0a46b5966a993111b66405a069af64b6969d019e Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 11:00:54 +0200 Subject: [PATCH 45/84] add panel type to did multi --- doubleml/did/did_multi.py | 38 +++- .../did/tests/test_did_multi_vs_binary.py | 2 +- .../did/tests/test_did_multi_vs_cs_binary.py | 208 ++++++++++++++++++ 3 files changed, 238 insertions(+), 10 deletions(-) create mode 100644 doubleml/did/tests/test_did_multi_vs_cs_binary.py diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py index 8c5d5163..c8f54313 100644 --- a/doubleml/did/did_multi.py +++ b/doubleml/did/did_multi.py @@ -12,6 +12,7 @@ from doubleml.data import DoubleMLPanelData from doubleml.did.did_aggregation import DoubleMLDIDAggregation from doubleml.did.did_binary import DoubleMLDIDBinary +from doubleml.did.did_cs_binary import DoubleMLDIDCSBinary from doubleml.did.utils._aggregation import ( _check_did_aggregation_dict, _compute_did_eventstudy_aggregation_weights, @@ -31,7 +32,7 @@ from doubleml.did.utils._plot import add_jitter from doubleml.double_ml import DoubleML from doubleml.double_ml_framework import concat -from doubleml.utils._checks import _check_score, _check_trimming +from doubleml.utils._checks import _check_bool, _check_score, _check_trimming from doubleml.utils._descriptive import generate_summary from doubleml.utils.gain_statistics import gain_statistics @@ -80,6 +81,10 @@ class DoubleMLDIDMulti: from the pretreatment covariates. Default is ``'observational'``. + panel : bool + Indicates whether to rely on panel data structure (``True``) or repeated cross sections (``False``). + Default is ``True``. + in_sample_normalization : bool Indicates whether to use in-sample normalization of weights. Default is ``True``. @@ -140,6 +145,7 @@ def __init__( n_folds=5, n_rep=1, score="observational", + panel=True, in_sample_normalization=True, trimming_rule="truncate", trimming_threshold=1e-2, @@ -179,6 +185,9 @@ def __init__( valid_scores = ["observational", "experimental"] _check_score(self.score, valid_scores, allow_callable=False) + _check_bool(panel, "panel") + self._panel = panel + # initialize framework which is constructed after the fit method is called self._framework = None @@ -332,6 +341,13 @@ def never_treated_value(self): """ return self._never_treated_value + @property + def panel(self): + """ + Indicates whether to rely on panel data structure (``True``) or repeated cross sections (``False``). + """ + return self._panel + @property def in_sample_normalization(self): """ @@ -1250,7 +1266,10 @@ def _check_external_predictions(self, external_predictions): + f"Passed keys: {set(external_predictions.keys())}." 
) - expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + if self.panel: + expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + else: + expected_learner_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] for key, value in external_predictions.items(): if not isinstance(value, dict): raise TypeError( @@ -1268,12 +1287,7 @@ def _rename_external_predictions(self, external_predictions): d_col = self._dml_data.d_cols[0] ext_pred_dict = {gt_combination: {d_col: {}} for gt_combination in self.gt_labels} for gt_combination in self.gt_labels: - if "ml_g0" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_g0"] = external_predictions[gt_combination]["ml_g0"] - if "ml_g1" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_g1"] = external_predictions[gt_combination]["ml_g1"] - if "ml_m" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_m"] = external_predictions[gt_combination]["ml_m"] + ext_pred_dict[gt_combination][d_col].update(external_predictions[gt_combination]) return ext_pred_dict @@ -1304,9 +1318,15 @@ def _initialize_models(self): "draw_sample_splitting": True, "print_periods": self._print_periods, } + if self.panel: + ModelClass = DoubleMLDIDBinary + else: + ModelClass = DoubleMLDIDCSBinary + + # iterate over all group-time combinations for i_model, (g_value, t_value_pre, t_value_eval) in enumerate(self.gt_combinations): # initialize models for all levels - model = DoubleMLDIDBinary(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs) + model = ModelClass(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs) modellist[i_model] = model diff --git a/doubleml/did/tests/test_did_multi_vs_binary.py b/doubleml/did/tests/test_did_multi_vs_binary.py index 40b877b2..15d3fd0c 100644 --- a/doubleml/did/tests/test_did_multi_vs_binary.py +++ b/doubleml/did/tests/test_did_multi_vs_binary.py @@ -49,7 +49,7 @@ def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_nor n_obs = 500 dpg = 1 boot_methods = ["normal"] - n_rep_boot = 50000 + n_rep_boot = 500 # collect data df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) diff --git a/doubleml/did/tests/test_did_multi_vs_cs_binary.py b/doubleml/did/tests/test_did_multi_vs_cs_binary.py new file mode 100644 index 00000000..59886854 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_vs_cs_binary.py @@ -0,0 +1,208 @@ +import math + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + 
+@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + boot_methods = ["normal"] + n_rep_boot = 500 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[1])] + dml_did_multi_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + panel=False, + **dml_args, + ) + dml_did_multi_obj.fit() + + treatment_col = dml_panel_data.d_cols[0] + ext_pred_dict = {treatment_col: {}} + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_pred_dict["d"][key] = dml_did_multi_obj.modellist[0].predictions[key][:, :, 0] + if score == "observational": + ext_pred_dict[treatment_col]["ml_m"] = dml_did_multi_obj.modellist[0].predictions["ml_m"][:, :, 0] + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=gt_combination[0][0], + t_value_pre=gt_combination[0][1], + t_value_eval=gt_combination[0][2], + ml_g=DMLDummyRegressor(), + ml_m=DMLDummyClassifier(), + **dml_args, + ) + dml_did_binary_obj.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef_multi": dml_did_multi_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se_multi": dml_did_multi_obj.se, + "se_binary": dml_did_binary_obj.se, + "boot_methods": boot_methods, + "nuisance_loss_multi": dml_did_multi_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_multi_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap + "_multi"] = dml_did_multi_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + res_dict["sensitivity_elements_multi"] = dml_did_multi_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.framework.sensitivity_elements + + dml_did_multi_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params_multi"] = dml_did_multi_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["coef_binary"][0], + dml_did_binary_vs_did_multi_fixture["coef_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_se(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["se_binary"][0], + dml_did_binary_vs_did_multi_fixture["se_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def 
test_boot(dml_did_binary_vs_did_multi_fixture): + for bootstrap in dml_did_binary_vs_did_multi_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_multi"].values, + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_multi_fixture): + assert ( + dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].keys() + == dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].items(): + assert np.allclose(value, dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_multi_fixture): + elements_multi = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_multi"] + elements_binary = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_binary"] + sensitivity_element_names = ["max_bias", "psi_max_bias", "sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + elements_multi[sensitivity_element], + elements_binary[sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_multi_fixture): + multi_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_multi"] + binary_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_binary"] + for key in ["theta", "se", "ci"]: + assert np.allclose( + multi_params[key]["lower"], + binary_params[key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + multi_params[key]["upper"], + binary_params[key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + multi_params[key], + binary_params[key], + rtol=1e-9, + atol=1e-4, + ) From 45dfcf5a7fe98b1d700a21426ce27e81159b3985 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 11:47:11 +0200 Subject: [PATCH 46/84] update single gt tests for did_cs --- .../did/tests/test_did_multi_aggregation_single_gt.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py index 0f71d91b..a6ffcd49 100644 --- a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py +++ b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py @@ -27,6 +27,11 @@ def score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[True, False]) def in_sample_normalization(request): return request.param @@ -43,7 +48,7 @@ def time_type(request): @pytest.fixture(scope="module") -def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_normalization, trimming_threshold): +def dml_single_gt_aggregation(aggregation, time_type, learner, score, panel, in_sample_normalization, trimming_threshold): n_obs = 500 dpg = 1 @@ -56,6 +61,7 @@ def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_ dml_args = { "n_folds": 3, "score": score, + "panel": panel, "in_sample_normalization": in_sample_normalization, "trimming_threshold": trimming_threshold, "draw_sample_splitting": True, From 29b0ee7114c7d1a99c5c22558c84f6a62e0d3403 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 11:47:35 +0200 
Subject: [PATCH 47/84] update exception tests for did cs --- doubleml/did/tests/test_did_multi_exceptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doubleml/did/tests/test_did_multi_exceptions.py b/doubleml/did/tests/test_did_multi_exceptions.py index aead8e48..88d373e3 100644 --- a/doubleml/did/tests/test_did_multi_exceptions.py +++ b/doubleml/did/tests/test_did_multi_exceptions.py @@ -18,6 +18,7 @@ "ml_g": LinearRegression(), "ml_m": LogisticRegression(), "gt_combinations": [(1, 0, 1)], + "panel": True, } @@ -43,6 +44,12 @@ def test_input(): invalid_arguments = {"control_group": 0} _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + # non boolean panel + msg = "panel has to be boolean. test of type was passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"panel": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + # propensity score adjustments msg = "in_sample_normalization indicator has to be boolean. Object of type passed." with pytest.raises(TypeError, match=msg): @@ -170,6 +177,12 @@ def test_check_external_predictions(): valid_pred = {model.gt_labels[0]: {"ml_g0": None, "ml_g1": None, "ml_m": None}} model._check_external_predictions(valid_pred) + model_cs = dml.did.DoubleMLDIDMulti(**valid_arguments | {"panel": False}) + valid_pred = { + model.gt_labels[0]: {"ml_g_d0_t0": None, "ml_g_d0_t1": None, "ml_g_d1_t0": None, "ml_g_d1_t1": None, "ml_m": None} + } + model_cs._check_external_predictions(valid_pred) + @pytest.mark.ci def test_exceptions_before_fit(): From 895a7627d2d3c444cb900cacfee5a0f475514556 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 11:47:50 +0200 Subject: [PATCH 48/84] update external prediction tests for did cs --- .../tests/test_did_multi_external_predictions.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/doubleml/did/tests/test_did_multi_external_predictions.py b/doubleml/did/tests/test_did_multi_external_predictions.py index e336487d..9bafdc6f 100644 --- a/doubleml/did/tests/test_did_multi_external_predictions.py +++ b/doubleml/did/tests/test_did_multi_external_predictions.py @@ -14,6 +14,11 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param @@ -30,7 +35,7 @@ def set_ml_g_ext(request): @pytest.fixture(scope="module") -def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext): +def doubleml_did_multi_ext_fixture(did_score, panel, n_rep, set_ml_m_ext, set_ml_g_ext): n_obs = 500 n_folds = 5 dgp = 1 @@ -47,6 +52,7 @@ def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext) "obj_dml_data": dml_panel_data, "gt_combinations": [(2, 0, 1)], "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": n_folds, } @@ -69,9 +75,12 @@ def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext) ml_m_ext = ml_m if set_ml_g_ext: + g_keys = ["ml_g0", "ml_g1"] if panel else ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): - ext_pred_dict[gt_label]["ml_g0"] = dml_obj.modellist[i_gt_combination].predictions["ml_g0"][:, :, 0] - ext_pred_dict[gt_label]["ml_g1"] = dml_obj.modellist[i_gt_combination].predictions["ml_g1"][:, :, 0] + predictions = dml_obj.modellist[i_gt_combination].predictions + for key in 
g_keys: + ext_pred_dict[gt_label][key] = predictions[key][:, :, 0] + ml_g_ext = DMLDummyRegressor() else: ml_g_ext = ml_g From b6ace7dae151340d2b2462e9a0d64af6d0b7ce0e Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 11:48:02 +0200 Subject: [PATCH 49/84] update placebo tests for did cs multi --- doubleml/did/tests/test_did_multi_placebo.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doubleml/did/tests/test_did_multi_placebo.py b/doubleml/did/tests/test_did_multi_placebo.py index 8f01d426..12435871 100644 --- a/doubleml/did/tests/test_did_multi_placebo.py +++ b/doubleml/did/tests/test_did_multi_placebo.py @@ -12,13 +12,18 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param @pytest.fixture(scope="module") -def doubleml_did_fixture(did_score, n_rep): +def doubleml_did_fixture(did_score, panel, n_rep): n_obs = 1000 dgp = 5 # has to be experimental (for experimental score to be valid) np.random.seed(42) @@ -36,6 +41,7 @@ def doubleml_did_fixture(did_score, n_rep): "ml_m": LogisticRegression(), "gt_combinations": gt_combinations, "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": 5, "draw_sample_splitting": True, From 176a99d8b3ac2dd8728a5a08368be3d974f23e86 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 13:34:06 +0200 Subject: [PATCH 50/84] update plot and return type tests for did multi --- doubleml/did/did_multi.py | 5 ++ doubleml/did/tests/test_did_multi_plot.py | 8 +++- .../did/tests/test_did_multi_return_types.py | 47 ++++++++++++++----- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py index c8f54313..646ad41d 100644 --- a/doubleml/did/did_multi.py +++ b/doubleml/did/did_multi.py @@ -187,6 +187,11 @@ def __init__( _check_bool(panel, "panel") self._panel = panel + # set score dim (n_elements, n_thetas, n_rep), just for checking purposes + if self.panel: + self._score_dim = (self._dml_data.n_ids, self.n_gt_atts, self.n_rep) + else: + self._score_dim = (self._dml_data.n_obs, self.n_gt_atts, self.n_rep) # initialize framework which is constructed after the fit method is called self._framework = None diff --git a/doubleml/did/tests/test_did_multi_plot.py b/doubleml/did/tests/test_did_multi_plot.py index 2eb15dcc..bcb8b786 100644 --- a/doubleml/did/tests/test_did_multi_plot.py +++ b/doubleml/did/tests/test_did_multi_plot.py @@ -13,13 +13,18 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param @pytest.fixture(scope="module") -def doubleml_did_fixture(did_score, n_rep): +def doubleml_did_fixture(did_score, panel, n_rep): n_obs = 1000 dgp = 5 # has to be experimental (for experimental score to be valid) np.random.seed(42) @@ -32,6 +37,7 @@ def doubleml_did_fixture(did_score, n_rep): "ml_m": LogisticRegression(), "gt_combinations": "all", "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": 2, "draw_sample_splitting": True, diff --git a/doubleml/did/tests/test_did_multi_return_types.py b/doubleml/did/tests/test_did_multi_return_types.py index c11544ed..d797230e 100644 --- a/doubleml/did/tests/test_did_multi_return_types.py +++ 
b/doubleml/did/tests/test_did_multi_return_types.py @@ -13,7 +13,7 @@ from doubleml.double_ml_framework import DoubleMLFramework # Test constants -N_OBS = 200 +N_IDS = 200 N_REP = 1 N_FOLDS = 3 N_REP_BOOT = 314 @@ -31,7 +31,7 @@ datasets = {} # panel data -df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") +df_panel = make_did_CS2021(n_obs=N_IDS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) datasets["did_panel"] = DoubleMLPanelData( df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] @@ -42,10 +42,23 @@ dml_objs = [ - (DoubleMLDIDMulti(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), DoubleMLDIDMulti), + ( + DoubleMLDIDMulti(datasets["did_panel"], panel=True, ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), + DoubleMLDIDMulti, + ), + ( + DoubleMLDIDMulti(datasets["did_panel"], panel=False, ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), + DoubleMLDIDMulti, + ), + ( + DoubleMLDIDMulti( + datasets["did_panel_binary_outcome"], panel=True, ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + ), + DoubleMLDIDMulti, + ), ( DoubleMLDIDMulti( - datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + datasets["did_panel_binary_outcome"], panel=False, ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args ), DoubleMLDIDMulti, ), @@ -84,13 +97,20 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): n_treat = len(fitted_dml_obj.gt_combinations) dml_obj = fitted_dml_obj + if dml_obj.panel: + score_dim = (N_IDS, n_treat, N_REP) + else: + score_dim = (df_panel.shape[0], n_treat, N_REP) + + assert dml_obj._score_dim == score_dim + # check_basic_property_types_and_shapes # check that the setting is still in line with the hard-coded values assert dml_obj._dml_data.n_treat == 1 assert dml_obj.n_gt_atts == n_treat assert dml_obj.n_rep == N_REP assert dml_obj.n_folds == N_FOLDS - assert dml_obj._dml_data.n_obs == N_OBS * N_PERIODS + assert dml_obj._dml_data.n_obs == df_panel.shape[0] assert dml_obj.n_rep_boot == N_REP_BOOT assert isinstance(dml_obj.all_coef, np.ndarray) @@ -112,11 +132,7 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): assert dml_obj.t_stat.shape == (n_treat,) assert isinstance(dml_obj.framework.scaled_psi, np.ndarray) - assert dml_obj.framework.scaled_psi.shape == ( - N_OBS, - n_treat, - N_REP, - ) + assert dml_obj.framework.scaled_psi.shape == score_dim assert isinstance(dml_obj.framework, DoubleMLFramework) assert isinstance(dml_obj.pval, np.ndarray) @@ -126,7 +142,10 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): assert len(dml_obj._dml_data.binary_treats) == 1 # check_basic_predictions_and_targets - expected_keys = ["ml_g0", "ml_g1", "ml_m"] + if dml_obj.panel: + expected_keys = ["ml_g0", "ml_g1", "ml_m"] + else: + expected_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] for key in expected_keys: assert isinstance(dml_obj.nuisance_loss[key], np.ndarray) assert dml_obj.nuisance_loss[key].shape == (N_REP, n_treat) @@ -137,6 +156,10 @@ def test_panel_sensitivity_return_types(fitted_dml_obj): n_treat = len(fitted_dml_obj.gt_combinations) benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] dml_obj = fitted_dml_obj + if dml_obj.panel: + score_dim = (N_IDS, n_treat, N_REP) + else: + score_dim = (df_panel.shape[0], 
n_treat, N_REP) assert isinstance(dml_obj.sensitivity_elements, dict) for key in ["sigma2", "nu2", "max_bias"]: @@ -144,7 +167,7 @@ def test_panel_sensitivity_return_types(fitted_dml_obj): assert dml_obj.sensitivity_elements[key].shape == (1, n_treat, N_REP) for key in ["psi_max_bias"]: assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) - assert dml_obj.sensitivity_elements[key].shape == (N_OBS, n_treat, N_REP) + assert dml_obj.sensitivity_elements[key].shape == score_dim assert isinstance(dml_obj.sensitivity_summary, str) dml_obj.sensitivity_analysis() From 9d59e5b1e378a2aa9c573b92aad02267cf3da586 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 11 Jun 2025 13:53:45 +0200 Subject: [PATCH 51/84] add additional did multi aggregation test --- ...st_did_multi_aggregation_manual_weights.py | 199 +++++++++++++++++- ...test_did_multi_aggregation_weight_index.py | 1 - 2 files changed, 198 insertions(+), 2 deletions(-) delete mode 100644 doubleml/did/tests/test_did_multi_aggregation_weight_index.py diff --git a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py index 35512d8f..57b00b31 100644 --- a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py +++ b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py @@ -1 +1,198 @@ -# TODO: For each aggregation method check if the manual weights equal the string aggregation method. +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._aggregation import ( + _compute_did_eventstudy_aggregation_weights, + _compute_did_group_aggregation_weights, + _compute_did_time_aggregation_weights, +) + + +@pytest.fixture(scope="module", params=["group", "time", "eventstudy"]) +def aggregation_method(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_fitted_obj(panel, score): + """Create a fitted DML object for testing.""" + n_obs = 200 + + # Create data + df = make_did_CS2021(n_obs=n_obs, dgp_type=1, time_type="float") + dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Create and fit model + ml_g = LinearRegression() + ml_m = LogisticRegression(solver="lbfgs", max_iter=250) + + dml_obj = dml.did.DoubleMLDIDMulti( + obj_dml_data=dml_data, + ml_g=ml_g, + ml_m=ml_m, + gt_combinations="standard", + panel=panel, + score=score, + n_folds=3, + n_rep=1, + ) + dml_obj.fit() + + return dml_obj + + +def _extract_manual_weights(dml_obj, aggregation_method): + """Extract manual weights from the aggregation method.""" + # Get the mask for non-masked values + selected_gt_mask = ~dml_obj.gt_index.mask + + if aggregation_method == "group": + # Exclude pre-treatment combinations for group aggregation + selected_gt_mask = selected_gt_mask & dml_obj._post_treatment_mask + aggregation_dict = _compute_did_group_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + d_values=dml_obj._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Group" + elif aggregation_method == "time": + # Exclude pre-treatment combinations for 
time aggregation + selected_gt_mask = selected_gt_mask & dml_obj._post_treatment_mask + aggregation_dict = _compute_did_time_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + t_values=dml_obj.t_values, + d_values=dml_obj._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Time" + else: + assert aggregation_method == "eventstudy" + aggregation_dict = _compute_did_eventstudy_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + t_values=dml_obj.t_values, + d_values=dml_obj._dml_data.d, + time_values=dml_obj._dml_data.t, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Event Study" + return aggregation_dict + + +@pytest.mark.ci +def test_string_vs_manual_weights_aggregation(dml_fitted_obj, aggregation_method): + """Test that string aggregation methods produce identical results to manual weights.""" + + # Get string-based aggregation result + agg_string = dml_fitted_obj.aggregate(aggregation=aggregation_method) + + # Extract manual weights + manual_weights_dict = _extract_manual_weights(dml_fitted_obj, aggregation_method) + + # Get manual aggregation result + agg_manual = dml_fitted_obj.aggregate(aggregation=manual_weights_dict) + + # Compare aggregated frameworks - coefficients + np.testing.assert_allclose( + agg_string.aggregated_frameworks.thetas, + agg_manual.aggregated_frameworks.thetas, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregated frameworks - standard errors + np.testing.assert_allclose( + agg_string.aggregated_frameworks.ses, + agg_manual.aggregated_frameworks.ses, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregated framework - coefficients + np.testing.assert_allclose( + agg_string.overall_aggregated_framework.thetas, + agg_manual.overall_aggregated_framework.thetas, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregated framework - standard errors + np.testing.assert_allclose( + agg_string.overall_aggregated_framework.ses, + agg_manual.overall_aggregated_framework.ses, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregation weights + np.testing.assert_allclose( + agg_string.aggregation_weights, + agg_manual.aggregation_weights, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregation weights + np.testing.assert_allclose( + agg_string.overall_aggregation_weights, + agg_manual.overall_aggregation_weights, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregation names + assert agg_string.aggregation_names == agg_manual.aggregation_names + + # Compare number of aggregations + assert agg_string.n_aggregations == agg_manual.n_aggregations + + +@pytest.mark.ci +def test_manual_weights_properties(dml_fitted_obj, aggregation_method): + """Test that manual weights have the expected properties.""" + + manual_weights_dict = _extract_manual_weights(dml_fitted_obj, aggregation_method) + + # Check that required keys are present + assert "weight_masks" in manual_weights_dict + assert "agg_names" in manual_weights_dict + assert "agg_weights" in manual_weights_dict + + weight_masks = manual_weights_dict["weight_masks"] + agg_weights = manual_weights_dict["agg_weights"] + + # Check weight masks properties + assert isinstance(weight_masks, np.ma.MaskedArray) + assert weight_masks.ndim == 4 + assert weight_masks.shape[:-1] == dml_fitted_obj.gt_index.shape + + # Check that aggregation weights sum to 1 + assert math.isclose(np.sum(agg_weights), 1.0, rel_tol=1e-9, abs_tol=1e-12) + + # Check that individual weight masks sum to 1 (for non-masked elements) 
+ n_aggregations = weight_masks.shape[-1] + for i in range(n_aggregations): + weights = weight_masks[..., i].compressed() + if len(weights) > 0: + assert math.isclose(np.sum(weights), 1.0, rel_tol=1e-9, abs_tol=1e-12) + + # Check that weight masks have the same mask as gt_index + for i in range(n_aggregations): + np.testing.assert_array_equal(weight_masks[..., i].mask, dml_fitted_obj.gt_index.mask) diff --git a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py deleted file mode 100644 index d001a4a8..00000000 --- a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py +++ /dev/null @@ -1 +0,0 @@ -# TODO: For each aggregation method check if the aggregated weights correspond to certain gt_combinations (group, time etc.) From f27bf2068ec30a64c1b8b37af4628172eb05b3d5 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Wed, 11 Jun 2025 14:55:49 +0200 Subject: [PATCH 52/84] some progress on refactoring the data backends. --- doubleml/data/tests/test_cluster_data.py | 53 ++++++++++-------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index 4489e528..9de9294c 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from doubleml import DoubleMLData +from doubleml import DoubleMLData, DoubleMLDIDData, DoubleMLSSMData from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018 @@ -11,29 +11,29 @@ def test_obj_vs_from_arrays(): np.random.seed(3141) dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) dml_data_from_array = DoubleMLData.from_arrays( - dml_data.data[dml_data.x_cols], - dml_data.data[dml_data.y_col], - dml_data.data[dml_data.d_cols], - dml_data.data[dml_data.cluster_cols], - dml_data.data[dml_data.z_cols], + x=dml_data.data[dml_data.x_cols], + y=dml_data.data[dml_data.y_col], + d=dml_data.data[dml_data.d_cols], + cluster_vars=dml_data.data[dml_data.cluster_cols], + z=dml_data.data[dml_data.z_cols], ) df = dml_data.data.copy() df.rename( columns={"cluster_var_i": "cluster_var1", "cluster_var_j": "cluster_var2", "Y": "y", "D": "d", "Z": "z"}, inplace=True ) - assert dml_data_from_array.data.equals(df) + assert dml_data_from_array.data[list(df.columns)].equals(df) # with a single cluster variable dml_data_from_array = DoubleMLData.from_arrays( - dml_data.data[dml_data.x_cols], - dml_data.data[dml_data.y_col], - dml_data.data[dml_data.d_cols], - dml_data.data[dml_data.cluster_cols[1]], - dml_data.data[dml_data.z_cols], + x=dml_data.data[dml_data.x_cols], + y=dml_data.data[dml_data.y_col], + d=dml_data.data[dml_data.d_cols], + cluster_vars=dml_data.data[dml_data.cluster_cols[1]], + z=dml_data.data[dml_data.z_cols], ) df = dml_data.data.copy().drop(columns="cluster_var_i") df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True) - assert dml_data_from_array.data.equals(df) + assert dml_data_from_array.data[list(df.columns)].equals(df) @pytest.mark.ci @@ -53,32 +53,22 @@ def test_x_cols_setter_defaults_w_cluster(): # without instrument and with time df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt") + dml_data = DoubleMLDIDData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", 
t_col="tt") assert dml_data.x_cols == ["xx1", "xx2"] # with instrument and with time df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt") + dml_data = DoubleMLDIDData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt") assert dml_data.x_cols == ["xx1", "xx2"] # without instrument and with selection df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss") + dml_data = DoubleMLSSMData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss") assert dml_data.x_cols == ["xx1", "xx2"] # with instrument and with selection df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] - - # without instrument with time with selection - df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss") - assert dml_data.x_cols == ["xx1", "xx2"] - - # with instrument with time with selection - df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"]) - dml_data = DoubleMLData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss") + dml_data = DoubleMLSSMData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss") assert dml_data.x_cols == ["xx1", "xx2"] @@ -107,7 +97,7 @@ def test_cluster_cols_setter(): with pytest.raises(ValueError, match=msg): dml_data.cluster_cols = "X13" - msg = r"The cluster variable\(s\) cluster_cols must be of str or list type. " "5 of type was passed." + msg = r"The cluster variable\(s\) cluster_cols must be of str or list type (or None). " "5 of type was passed." with pytest.raises(TypeError, match=msg): dml_data.cluster_cols = 5 @@ -154,14 +144,14 @@ def test_disjoint_sets(): r"and cluster variable\(s\) \(``cluster_cols``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") + _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") msg = ( r"At least one variable/column is set as score or selection variable \(``s_col``\) " r"and cluster variable\(s\) \(``cluster_cols``\)." 
) with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") + _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") @pytest.mark.ci @@ -215,14 +205,13 @@ def test_cluster_data_str(): df["time_var"] = 1 df["score_var"] = 0.5 - dml_data_with_optional = DoubleMLData( + dml_data_with_optional = DoubleMLDIDData( data=df, y_col="Y", d_cols="D", cluster_cols=["cluster_var_i", "cluster_var_j"], z_cols="Z", t_col="time_var", - s_col="score_var", ) dml_str_optional = str(dml_data_with_optional) From 9e3e6d62a00a9cf3b3394476cd7fbf71e30ea31f Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 12 Jun 2025 09:25:55 +0200 Subject: [PATCH 53/84] update did cs multi test for cs data --- doubleml/did/tests/test_did_multi_vs_cs_binary.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/doubleml/did/tests/test_did_multi_vs_cs_binary.py b/doubleml/did/tests/test_did_multi_vs_cs_binary.py index 59886854..7af8d74d 100644 --- a/doubleml/did/tests/test_did_multi_vs_cs_binary.py +++ b/doubleml/did/tests/test_did_multi_vs_cs_binary.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.datasets import make_did_cs_CS2021 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor @@ -44,15 +44,20 @@ def time_type(request): return request.param +@pytest.fixture(scope="module", params=[0.5, 0.1]) +def lambda_t(request): + return request.param + + @pytest.fixture(scope="module") -def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): +def dml_did_binary_vs_did_multi_fixture(time_type, lambda_t, learner, score, in_sample_normalization, trimming_threshold): n_obs = 500 dpg = 1 boot_methods = ["normal"] n_rep_boot = 500 # collect data - df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type, lambda_t=lambda_t) dml_panel_data = dml.data.DoubleMLPanelData( df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) From 5c4d1e25a2c0e9560e6af3f01ac287e933367f81 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 12 Jun 2025 12:57:17 +0200 Subject: [PATCH 54/84] update did binary to work with unbalanced panels --- doubleml/did/did_binary.py | 7 +- ..._binary_external_predictions_unbalanced.py | 93 +++++++++++++++++++ 2 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index a9939c97..6fa19e0d 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -421,9 +421,10 @@ def _preprocess_data(self, g_value, pre_t, eval_t): id_col = self._dml_data.id_col g_col = self._dml_data.g_col - # relevent data subset - data_subset_indicator = data[t_col].isin([pre_t, eval_t]) - data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + # relevent data subset: Only include units which are observed in both periods + relevant_time_data = data[data[t_col].isin([pre_t, eval_t])] + ids_with_both_periods_filter = relevant_time_data.groupby(id_col)[t_col].transform("nunique") == 2 + data_subset = relevant_time_data[ids_with_both_periods_filter].sort_values(by=[id_col, t_col]) # Construct G (treatment 
group) indicating treatment period in g G_indicator = (data_subset[g_col] == g_value).astype(int) diff --git a/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py new file mode 100644 index 00000000..ffeadb51 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py @@ -0,0 +1,93 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_cs_CS2021 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g0", "ml_g1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["coef"], doubleml_did_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["se"], doubleml_did_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_panel_fixture): + assert np.allclose(doubleml_did_panel_fixture["score"], doubleml_did_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_panel_fixture): + for key, value in doubleml_did_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) From 8437d79f0edf50f69bc2de690762c951c11a0cad Mon Sep 17 00:00:00 2001 From: Jan 
Teichert-Kluge Date: Thu, 12 Jun 2025 14:55:52 +0200 Subject: [PATCH 55/84] formatting issue --- doubleml/data/base_data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py index 9ba8bc00..2297944e 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -11,10 +11,7 @@ class DoubleMLBaseData(ABC): - """Bas x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])] - # baseline version with features, outcome and treatments - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)Class Double machine learning data-backends - """ + """Base Class Double machine learning data-backends""" def __init__(self, data): if not isinstance(data, pd.DataFrame): From e58f55038ca173293dc2a6e0d41b6d8f2ecadb1b Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 15:06:07 +0200 Subject: [PATCH 56/84] updt. unit tests --- doubleml/data/tests/test_cluster_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index 9de9294c..91627158 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -140,15 +140,15 @@ def test_disjoint_sets(): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2") msg = ( - r"At least one variable/column is set as time variable \(``t_col``\) " - r"and cluster variable\(s\) \(``cluster_cols``\)." + r"At least one variable/column is set as cluster variable\(s\) \(``cluster_cols``\) " + r"and time variable \(``t_col``\)." ) with pytest.raises(ValueError, match=msg): _ = DoubleMLDIDData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") msg = ( - r"At least one variable/column is set as score or selection variable \(``s_col``\) " - r"and cluster variable\(s\) \(``cluster_cols``\)." + r"At least one variable/column is set as cluster variable\(s\) \(``cluster_cols``\) " + r"and selection variable \(``s_col``\)." 
) with pytest.raises(ValueError, match=msg): _ = DoubleMLSSMData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") From a2deba93923426341520611aaea0f2158819f10a Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 15:17:41 +0200 Subject: [PATCH 57/84] fix cluster DGP to use correct data backend --- .../datasets/dgp_pliv_multiway_cluster_CKMS2021.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index df2b4cbe..39ff6a26 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -2,11 +2,11 @@ import pandas as pd from scipy.linalg import toeplitz -from doubleml.data import DoubleMLClusterData -from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_data_alias -def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs): +def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLData", **kwargs): """ Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. (2021). The data generating process is defined as @@ -188,12 +188,14 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return if return_type in _array_alias: return x, y, d, cluster_vars.values, z - elif return_type in _data_frame_alias + _dml_cluster_data_alias: + elif return_type in _data_frame_alias + _dml_data_alias: x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] data = pd.concat((cluster_vars, pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["Y", "D", "Z"])), axis=1) if return_type in _data_frame_alias: return data else: - return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") + return DoubleMLData( + data, y_col="Y", d_cols="D", cluster_cols=cluster_cols, x_cols=x_cols, z_cols="Z", is_cluster_data=True + ) else: raise ValueError("Invalid return_type.") From cb1168484015670f07bdba59344e716964d7e995 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 15:17:47 +0200 Subject: [PATCH 58/84] update unit tests --- doubleml/data/tests/test_cluster_data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py index 91627158..a2cd726f 100644 --- a/doubleml/data/tests/test_cluster_data.py +++ b/doubleml/data/tests/test_cluster_data.py @@ -97,7 +97,7 @@ def test_cluster_cols_setter(): with pytest.raises(ValueError, match=msg): dml_data.cluster_cols = "X13" - msg = r"The cluster variable\(s\) cluster_cols must be of str or list type (or None). " "5 of type was passed." + msg = r"The cluster variable\(s\) cluster_cols must be of str or list type \(or None\)\. " "5 of type was passed." with pytest.raises(TypeError, match=msg): dml_data.cluster_cols = 5 @@ -161,7 +161,7 @@ def test_duplicates(): msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values."
with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"]) + _ = DoubleMLData(dml_cluster_data.data, y_col="Y", d_cols=["D"], cluster_cols=["X3", "X2", "X3"], is_cluster_data=True) with pytest.raises(ValueError, match=msg): dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] @@ -215,5 +215,4 @@ def test_cluster_data_str(): ) dml_str_optional = str(dml_data_with_optional) - assert "Time variable: time_var" in dml_str_optional - assert "Score/Selection variable: score_var" in dml_str_optional + assert "Time variable: time_var" in dml_str_optional \ No newline at end of file From 3fe83ff6683cd6d609cb841dda453ce24a7a2d5c Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 12 Jun 2025 15:21:07 +0200 Subject: [PATCH 59/84] align subset naming in did binary and cs version --- doubleml/did/did_binary.py | 32 +++++++++---------- doubleml/did/did_cs_binary.py | 26 +++++++-------- .../test_did_binary_external_predictions.py | 4 +-- ..._binary_external_predictions_unbalanced.py | 2 +- .../did/tests/test_did_binary_vs_did_panel.py | 2 +- ...test_did_cs_binary_external_predictions.py | 2 +- doubleml/did/tests/test_return_types.py | 12 +++---- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index 6fa19e0d..99e18e28 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -163,10 +163,10 @@ def __init__( # Preprocess data # Y1, Y0 might be needed if we want to support custom estimators and scores; currently only output y_diff - self._panel_data_wide = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + self._data_subset = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) # Handling id values to match pairwise evaluation & simultaneous inference - id_panel_data = self._panel_data_wide[self._dml_data.id_col].values + id_panel_data = self._data_subset[self._dml_data.id_col].values id_original = self._dml_data.id_var_unique if not np.all(np.isin(id_panel_data, id_original)): raise ValueError("The id values in the panel data are not a subset of the original id values.") @@ -177,13 +177,13 @@ def __init__( # Numeric values for positions of the entries in id_panel_data inside id_original # np.nonzero(np.isin(id_original, id_panel_data)) - self._n_obs_subset = self._panel_data_wide.shape[0] # Effective sample size used for resampling - self._n_treated_subset = self._panel_data_wide["G_indicator"].sum() + self._n_obs_subset = self._data_subset.shape[0] # Effective sample size used for resampling + self._n_treated_subset = self._data_subset["G_indicator"].sum() # Save x and y for later ML estimation - self._x_panel = self._panel_data_wide.loc[:, self._dml_data.x_cols].values - self._y_panel = self._panel_data_wide.loc[:, "y_diff"].values - self._g_panel = self._panel_data_wide.loc[:, "G_indicator"].values + self._x_data_subset = self._data_subset.loc[:, self._dml_data.x_cols].values + self._y_data_subset = self._data_subset.loc[:, "y_diff"].values + self._g_data_subset = self._data_subset.loc[:, "G_indicator"].values valid_scores = ["observational", "experimental"] _check_score(self.score, valid_scores, allow_callable=False) @@ -196,7 +196,7 @@ def __init__( ) # set stratication for resampling - self._strata = self._panel_data_wide["G_indicator"] + self._strata = self._data_subset["G_indicator"] self._n_obs_sample_splitting = self.n_obs_subset if draw_sample_splitting: 
self.draw_sample_splitting() @@ -342,11 +342,11 @@ def anticipation_periods(self): return self._anticipation_periods @property - def panel_data_wide(self): + def data_subset(self): """ The preprocessed panel data in wide format. """ - return self._panel_data_wide + return self._data_subset @property def id_positions(self): @@ -470,8 +470,8 @@ def _preprocess_data(self, g_value, pre_t, eval_t): def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # Here: d is a binary treatment indicator - x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) - x, d = check_X_y(x, self._g_panel, force_all_finite=False) + x, y = check_X_y(self._x_data_subset, self._y_data_subset, force_all_finite=False) + x, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # nuisance g # get train indices for d == 0 smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) @@ -611,8 +611,8 @@ def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, p_hat): def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): - x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) - x, d = check_X_y(x, self._g_panel, force_all_finite=False) + x, y = check_X_y(self._x_data_subset, self._y_data_subset, force_all_finite=False) + x, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # get train indices for d == 0 and d == 1 smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) @@ -676,8 +676,8 @@ def _nuisance_tuning( return res def _sensitivity_element_est(self, preds): - y = self._y_panel - d = self._g_panel + y = self._y_data_subset + d = self._g_data_subset m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) g_hat0 = _get_id_positions(preds["predictions"]["ml_g0"], self.id_positions) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 7788f4b3..9e5ee6c2 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -97,10 +97,10 @@ def __init__( self._n_obs_subset = self.data_subset.shape[0] # Effective sample size used for resampling # Save x and y for later ML estimation - self._x_data = self.data_subset.loc[:, self._dml_data.x_cols].values - self._y_data = self.data_subset.loc[:, self._dml_data.y_col].values - self._g_data = self.data_subset.loc[:, "G_indicator"].values - self._t_data = self.data_subset.loc[:, "t_indicator"].values + self._x_data_subset = self.data_subset.loc[:, self._dml_data.x_cols].values + self._y_data_subset = self.data_subset.loc[:, self._dml_data.y_col].values + self._g_data_subset = self.data_subset.loc[:, "G_indicator"].values + self._t_data_subset = self.data_subset.loc[:, "t_indicator"].values valid_scores = ["observational", "experimental"] _check_score(self.score, valid_scores, allow_callable=False) @@ -402,9 +402,9 @@ def _estimate_conditional_g( def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # Here: d is a binary treatment indicator - x, y = check_X_y(X=self._x_data, y=self._y_data, force_all_finite=False) - _, d = check_X_y(x, self._g_data, force_all_finite=False) # (d is the G_indicator) - _, t = check_X_y(x, self._t_data, force_all_finite=False) + x, y = check_X_y(X=self._x_data_subset, y=self._y_data_subset, force_all_finite=False) + _, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data_subset, force_all_finite=False) # THIS DIFFERS FROM THE PAPER due to stratified splitting this 
should be the same for each fold # nuisance estimates of the uncond. treatment prob. @@ -588,9 +588,9 @@ def _score_elements(self, y, d, t, g_hat_d0_t0, g_hat_d0_t1, g_hat_d1_t0, g_hat_ def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): - x, y = check_X_y(X=self._x_data, y=self._y_data, force_all_finite=False) - _, d = check_X_y(x, self._g_data, force_all_finite=False) # (d is the G_indicator) - _, t = check_X_y(x, self._t_data, force_all_finite=False) + x, y = check_X_y(X=self._x_data_subset, y=self._y_data_subset, force_all_finite=False) + _, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data_subset, force_all_finite=False) if scoring_methods is None: scoring_methods = {"ml_g": None, "ml_m": None} @@ -702,9 +702,9 @@ def _nuisance_tuning( return res def _sensitivity_element_est(self, preds): - y = self._y_data - d = self._g_data - t = self._t_data + y = self._y_data_subset + d = self._g_data_subset + t = self._t_data_subset m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) g_hat_d0_t0 = _get_id_positions(preds["predictions"]["ml_g_d0_t0"], self.id_positions) diff --git a/doubleml/did/tests/test_did_binary_external_predictions.py b/doubleml/did/tests/test_did_binary_external_predictions.py index 0cb3e055..0a6cf2f0 100644 --- a/doubleml/did/tests/test_did_binary_external_predictions.py +++ b/doubleml/did/tests/test_did_binary_external_predictions.py @@ -40,7 +40,7 @@ def doubleml_did_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) @@ -112,7 +112,7 @@ def doubleml_did_panel_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) diff --git a/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py index ffeadb51..a921efee 100644 --- a/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py +++ b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py @@ -42,7 +42,7 @@ def doubleml_did_panel_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 9da81739..426b413c 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -78,7 +78,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza ) dml_did_binary_obj.fit() - df_wide = 
dml_did_binary_obj._panel_data_wide.copy() + df_wide = dml_did_binary_obj.data_subset.copy() dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) dml_did_obj = dml.DoubleMLDID( dml_data, diff --git a/doubleml/did/tests/test_did_cs_binary_external_predictions.py b/doubleml/did/tests/test_did_cs_binary_external_predictions.py index 477c6dc7..f6b77f0b 100644 --- a/doubleml/did/tests/test_did_cs_binary_external_predictions.py +++ b/doubleml/did/tests/test_did_cs_binary_external_predictions.py @@ -114,7 +114,7 @@ def doubleml_did_cs_panel_fixture(did_score, n_rep): } dml_did = DoubleMLDIDCSBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index 1b6fa736..683b1dc1 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -122,12 +122,12 @@ def test_panel_return_types(dml_obj, cls): assert isinstance(dml_obj.t_value_pre, (int, np.integer, float, np.floating)) assert isinstance(dml_obj.post_treatment, bool) - # Test panel_data_wide property - assert isinstance(dml_obj.panel_data_wide, pd.DataFrame) - assert dml_obj.panel_data_wide.shape[0] <= N_OBS - assert "G_indicator" in dml_obj.panel_data_wide.columns - assert "C_indicator" in dml_obj.panel_data_wide.columns - assert "y_diff" in dml_obj.panel_data_wide.columns + # Test data_subset property + assert isinstance(dml_obj.data_subset, pd.DataFrame) + assert dml_obj.data_subset.shape[0] <= N_OBS + assert "G_indicator" in dml_obj.data_subset.columns + assert "C_indicator" in dml_obj.data_subset.columns + assert "y_diff" in dml_obj.data_subset.columns # Test id_positions property assert isinstance(dml_obj.id_positions, np.ndarray) From 1eec50ced37a629425dcd66994839412d7d1f6d6 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 16:17:50 +0200 Subject: [PATCH 60/84] fix panel data backend / unit tests --- doubleml/data/panel_data.py | 15 ++++++++++----- doubleml/data/tests/test_panel_data.py | 2 +- doubleml/data/tests/test_panel_data_exceptions.py | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index c1ec3bb5..a3651756 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -157,8 +157,9 @@ def datetime_unit(self): """ The unit of the time variable. """ - return self._datetime_unit @ property + return self._datetime_unit + @property def d(self): """ Array of treatment variable; @@ -228,9 +229,13 @@ def g_col(self): """ The treatment variable indicating the time of treatment exposure. 
""" - return self._d_cols[0] @ DoubleMLData.d_cols.setter + return self._d_cols[0] + + @ DoubleMLData.d_cols.setter def d_cols(self, value): + if isinstance(value, str): + value = [value] super(self.__class__, self.__class__).d_cols.__set__(self, value) if hasattr(self, "_g_values"): self._g_values = np.sort(np.unique(self.d)) # update unique values of g @@ -266,7 +271,7 @@ def t_col(self, value): ) # Check if data exists (during initialization it might not) if hasattr(self, "_data") and value not in self.all_variables: - raise ValueError("Invalid time variable t_col. The time variable is no data column.") + raise ValueError(f"Invalid time variable t_col. {value} is no data column.") self._t_col = value # Update time variable array if data is already loaded if hasattr(self, "_data"): @@ -301,8 +306,8 @@ def _check_disjoint_sets(self): self._check_disjoint_sets_t_col() def _check_disjoint_sets_id_col(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLPanelData, self)._check_disjoint_sets() + # The call to super()._check_disjoint_sets() is removed from here as it's redundant + # and called in the main _check_disjoint_sets method of this class. # special checks for the additional id variable (and the time variable) id_col_set = {self.id_col} diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py index 2f2250ba..a9ea0ea2 100644 --- a/doubleml/data/tests/test_panel_data.py +++ b/doubleml/data/tests/test_panel_data.py @@ -33,7 +33,7 @@ def test_t_col_setter(): with pytest.raises(ValueError, match=msg): dml_data.t_col = "a13" - msg = r"The time variable t_col must be of str type \(or None\). " "5 of type was passed." + msg = r"The time variable t_col must be of str type. " "5 of type was passed." 
with pytest.raises(TypeError, match=msg): dml_data.t_col = 5 diff --git a/doubleml/data/tests/test_panel_data_exceptions.py b/doubleml/data/tests/test_panel_data_exceptions.py index fab648fe..7480bce1 100644 --- a/doubleml/data/tests/test_panel_data_exceptions.py +++ b/doubleml/data/tests/test_panel_data_exceptions.py @@ -109,5 +109,5 @@ def test_invalid_datetime_unit(sample_data): # test if no exception is raised @pytest.mark.ci def test_no_exception(sample_data): - DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="id") + DoubleMLPanelData(data=sample_data, y_col="y", d_cols=["treatment"], t_col="time", id_col="id") assert True From d71dff605fa55cbbda4e75a8cea0ab4298e7b3b7 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 16:18:04 +0200 Subject: [PATCH 61/84] fix did data backend / unit tests --- doubleml/did/datasets/dgp_did_SZ2020.py | 8 ++++---- doubleml/did/tests/test_datasets.py | 6 +++--- doubleml/did/tests/test_did_cs_external_predictions.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py index db82b032..eb150bbf 100644 --- a/doubleml/did/datasets/dgp_did_SZ2020.py +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -189,7 +189,7 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _data_frame_alias: return data else: - return DoubleMLDIDData(data, "y", "d", x_cols=z_cols) + return DoubleMLDIDData(data, y_col="y", d_cols="d", x_cols=z_cols) elif return_type == "DoubleMLPanelData": z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] df0 = ( @@ -218,7 +218,7 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty ) df = pd.concat([df0, df1], axis=0) - return DoubleMLPanelData(df, "y", "d", t_col="t", id_col="id", x_cols=z_cols) + return DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=z_cols) else: raise ValueError("Invalid return_type.") @@ -235,6 +235,6 @@ def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_ty if return_type in _data_frame_alias: return data elif return_type in _dml_did_data_alias: - return DoubleMLDIDData(data, "y", "d", x_cols=z_cols, t_col="t") + return DoubleMLDIDData(data, y_col="y", d_cols="d", x_cols=z_cols, t_col="t") else: - raise ValueError("Invalid return_type.") \ No newline at end of file + raise ValueError("Invalid return_type.") diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py index 0e323ec9..508769eb 100644 --- a/doubleml/did/tests/test_datasets.py +++ b/doubleml/did/tests/test_datasets.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from doubleml import DoubleMLData +from doubleml import DoubleMLDIDData from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 msg_inv_return_type = "Invalid return_type." 
@@ -21,8 +21,8 @@ def dgp_type(request): @pytest.mark.ci def test_make_did_SZ2020_return_types(cross_sectional, dgp_type): np.random.seed(3141) - res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLData) - assert isinstance(res, DoubleMLData) + res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLDIDData) + assert isinstance(res, DoubleMLDIDData) res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=pd.DataFrame) assert isinstance(res, pd.DataFrame) if cross_sectional: diff --git a/doubleml/did/tests/test_did_cs_external_predictions.py b/doubleml/did/tests/test_did_cs_external_predictions.py index 2b28ac8a..1c5f6640 100644 --- a/doubleml/did/tests/test_did_cs_external_predictions.py +++ b/doubleml/did/tests/test_did_cs_external_predictions.py @@ -24,7 +24,7 @@ def n_rep(request): @pytest.fixture(scope="module") def doubleml_didcs_fixture(did_score, n_rep): ext_predictions = {"d": {}} - dml_data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type="DoubleMLData") + dml_data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type="DoubleMLDIDData") all_smpls = draw_smpls(len(dml_data.y), 5, n_rep=n_rep, groups=dml_data.d) kwargs = {"obj_dml_data": dml_data, "score": did_score, "n_rep": n_rep, "n_folds": 5, "draw_sample_splitting": False} dml_did_cs = DoubleMLDIDCS(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) From 74ef476768d50fc21650b21a30011374f1f43f3b Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 16:35:51 +0200 Subject: [PATCH 62/84] add depr. warning with version --- doubleml/data/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py index 7d368b76..0462c763 100644 --- a/doubleml/data/__init__.py +++ b/doubleml/data/__init__.py @@ -2,7 +2,6 @@ The :mod:`doubleml.data` module implements data classes for double machine learning. """ -from .base_data import DoubleMLData import warnings from .base_data import DoubleMLData @@ -33,7 +32,8 @@ def __init__( force_all_x_finite=True, ): warnings.warn( - "DoubleMLClusterData is deprecated. " "Use DoubleMLData with is_cluster_data=True instead.", + "DoubleMLClusterData is deprecated and will be removed with version 0.12.0. " + "Use DoubleMLData with is_cluster_data=True instead.", FutureWarning, stacklevel=2, ) @@ -56,10 +56,12 @@ def from_arrays( ): """ Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. - This method is deprecated, use DoubleMLData.from_arrays with is_cluster_data=True instead. + This method is deprecated and will be removed with version 0.12.0, + use DoubleMLData.from_arrays with is_cluster_data=True instead. """ warnings.warn( - "DoubleMLClusterData is deprecated. " "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", + "DoubleMLClusterData is deprecated and will be removed with version 0.12.0. 
" + "Use DoubleMLData.from_arrays with is_cluster_data=True instead.", FutureWarning, stacklevel=2, ) From e7a9f5c75e0fd7fc5aff78998012052e7e993f51 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Thu, 12 Jun 2025 16:39:36 +0200 Subject: [PATCH 63/84] update return type tests for did cs binary --- doubleml/did/did_cs_binary.py | 2 +- doubleml/did/tests/test_return_types.py | 58 +++++++++++++++++++++---- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 9e5ee6c2..a6005d53 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -90,7 +90,7 @@ def __init__( # Find position of data subset in original data # These entries should be replaced by nuisance predictions, all others should be set to 0. - self._id_positions = self.data_subset.index + self._id_positions = self.data_subset.index.values # Numeric values for positions of the entries in id_panel_data inside id_original # np.nonzero(np.isin(id_original, id_panel_data)) diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index 683b1dc1..37105c3e 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -4,8 +4,8 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml.data import DoubleMLData, DoubleMLPanelData -from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS -from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS, DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 from doubleml.utils._check_return_types import ( check_basic_predictions_and_targets, check_basic_property_types_and_shapes, @@ -89,6 +89,17 @@ def test_sensitivity_return_types(fitted_dml_obj): df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) +# Create a dataset for DoubleMLDIDCSBinary +df_panel_cs = make_did_cs_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") +df_panel_cs["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel_cs.shape[0]) +datasets["did_panel_cs"] = DoubleMLPanelData( + df_panel_cs, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_cs_binary_outcome"] = DoubleMLPanelData( + df_panel_cs, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + + dml_panel_binary_args = dml_args | { "g_value": 2, "t_value_pre": 0, @@ -106,6 +117,19 @@ def test_sensitivity_return_types(fitted_dml_obj): ), DoubleMLDIDBinary, ), + ( + DoubleMLDIDCSBinary(datasets["did_panel_cs"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_panel_binary_args), + DoubleMLDIDCSBinary, + ), + ( + DoubleMLDIDCSBinary( + datasets["did_panel_cs_binary_outcome"], + ml_g=LogisticRegression(), + ml_m=LogisticRegression(), + **dml_panel_binary_args, + ), + DoubleMLDIDCSBinary, + ), ] @@ -124,10 +148,14 @@ def test_panel_return_types(dml_obj, cls): # Test data_subset property assert isinstance(dml_obj.data_subset, pd.DataFrame) - assert dml_obj.data_subset.shape[0] <= N_OBS + if isinstance(dml_obj, DoubleMLDIDBinary): + assert dml_obj.data_subset.shape[0] <= N_OBS + assert "y_diff" in dml_obj.data_subset.columns + elif isinstance(dml_obj, DoubleMLDIDCSBinary): + assert dml_obj.data_subset.shape[0] <= N_OBS * 2 + assert "t_indicator" in 
dml_obj.data_subset.columns assert "G_indicator" in dml_obj.data_subset.columns assert "C_indicator" in dml_obj.data_subset.columns - assert "y_diff" in dml_obj.data_subset.columns # Test id_positions property assert isinstance(dml_obj.id_positions, np.ndarray) @@ -142,7 +170,10 @@ def test_panel_return_types(dml_obj, cls): # Test n_obs property assert isinstance(dml_obj.n_obs, (int, np.integer)) - assert dml_obj.n_obs <= N_OBS + if isinstance(dml_obj, DoubleMLDIDBinary): + assert dml_obj.n_obs <= N_OBS + elif isinstance(dml_obj, DoubleMLDIDCSBinary): + assert dml_obj.n_obs <= N_OBS * N_PERIODS # Test consistency between properties if dml_obj.post_treatment: @@ -161,20 +192,29 @@ def fitted_panel_dml_obj(request): @pytest.mark.ci def test_panel_property_types_and_shapes(fitted_panel_dml_obj): + # n_obs for psi, psi_a, psi_b checks within check_basic_property_types_and_shapes + # This should be the number of observations used for the score calculation. + # For DIDBinary, it's n_ids. For DIDCSBinary, it's _n_obs_subset. + # Both are consistently available as fitted_panel_dml_obj.n_obs. + actual_score_dim = (fitted_panel_dml_obj.n_obs, N_REP, N_TREAT) + check_basic_property_types_and_shapes( fitted_panel_dml_obj, - n_obs=N_PERIODS * N_OBS, + n_obs=fitted_panel_dml_obj._dml_data.n_obs, n_treat=N_TREAT, n_rep=N_REP, n_folds=N_FOLDS, n_rep_boot=N_REP_BOOT, - score_dim=(N_OBS, N_REP, N_TREAT), + score_dim=actual_score_dim, # Used for psi shape ) - check_basic_predictions_and_targets(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP) + + check_basic_predictions_and_targets(fitted_panel_dml_obj, fitted_panel_dml_obj.n_obs, N_TREAT, N_REP) @pytest.mark.ci def test_panel_sensitivity_return_types(fitted_panel_dml_obj): if fitted_panel_dml_obj._sensitivity_implemented: benchmarking_set = [fitted_panel_dml_obj._dml_data.x_cols[0]] - check_sensitivity_return_types(fitted_panel_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) + check_sensitivity_return_types( + fitted_panel_dml_obj, fitted_panel_dml_obj.n_obs, N_REP, N_TREAT, benchmarking_set=benchmarking_set + ) From bba51605df0716294bef50a87ce557904e46c4e7 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 17:05:54 +0200 Subject: [PATCH 64/84] adjust unit tests for ssm --- doubleml/irm/tests/test_ssm.py | 4 ++-- doubleml/irm/tests/test_ssm_tune.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/irm/tests/test_ssm.py b/doubleml/irm/tests/test_ssm.py index b157794b..c561d9fe 100644 --- a/doubleml/irm/tests/test_ssm.py +++ b/doubleml/irm/tests/test_ssm.py @@ -54,11 +54,11 @@ def dml_selection_fixture( np.random.seed(42) if score == "missing-at-random": - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, z=None, s=s) + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) else: assert score == "nonignorable" - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, s=s) + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) np.random.seed(42) diff --git a/doubleml/irm/tests/test_ssm_tune.py b/doubleml/irm/tests/test_ssm_tune.py index 0fafbc13..4e48bec3 100644 --- a/doubleml/irm/tests/test_ssm_tune.py +++ b/doubleml/irm/tests/test_ssm_tune.py @@ -76,7 +76,7 @@ def dml_ssm_fixture( np.random.seed(42) if score == "missing-at-random": - obj_dml_data = 
dml.DoubleMLData.from_arrays(x, y, d, z=None, s=s) + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) dml_sel_obj = dml.DoubleMLSSM( obj_dml_data, ml_g, @@ -89,7 +89,7 @@ def dml_ssm_fixture( ) else: assert score == "nonignorable" - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, z=z, s=s) + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) dml_sel_obj = dml.DoubleMLSSM( obj_dml_data, ml_g, From 96ebd03efa3674506dd859c2eec306cacd6aa36b Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 17:06:06 +0200 Subject: [PATCH 65/84] adjust unit tests for did --- doubleml/did/tests/test_did_cs_tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/did/tests/test_did_cs_tune.py b/doubleml/did/tests/test_did_cs_tune.py index 5ec33e82..50415937 100644 --- a/doubleml/did/tests/test_did_cs_tune.py +++ b/doubleml/did/tests/test_did_cs_tune.py @@ -67,7 +67,7 @@ def dml_did_cs_fixture(generate_data_did_cs, learner_g, learner_m, score, in_sam all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d + 2 * t) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, t=t) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t) dml_did_cs_obj = dml.DoubleMLDIDCS( obj_dml_data, ml_g, From a1686d5c7b4b2d51287f19c3aa2557d866b4e798 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 17:06:17 +0200 Subject: [PATCH 66/84] adjust unit tests general --- doubleml/tests/test_datasets.py | 2 +- doubleml/tests/test_exceptions.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index 8f1c4f03..aa1d9433 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -151,7 +151,7 @@ def test_make_pliv_CHS2015_return_types(): @pytest.mark.ci def test_make_pliv_multiway_cluster_CKMS2021_return_types(): np.random.seed(3141) - res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLClusterData") + res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") assert isinstance(res, DoubleMLClusterData) res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") assert isinstance(res, pd.DataFrame) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index d8fe4e7c..5178adc6 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -8,7 +8,7 @@ from doubleml import ( DoubleMLBLP, - DoubleMLClusterData, + DoubleMLDIDData, DoubleMLCVAR, DoubleMLData, DoubleMLDID, @@ -265,11 +265,11 @@ def test_doubleml_exception_data(): df_did_cs["d"] = df_did_cs["d"] * 2 with pytest.raises(ValueError, match=msg): # non-binary D for DIDCS - _ = DoubleMLDIDCS(DoubleMLData(df_did_cs, y_col="y", d_cols="d", t_col="t"), Lasso(), LogisticRegression()) + _ = DoubleMLDIDCS(DoubleMLDIDData(df_did_cs, y_col="y", d_cols="d", t_col="t"), Lasso(), LogisticRegression()) df_did_cs = dml_data_did_cs.data.copy() with pytest.raises(ValueError, match=msg): # multiple D for DIDCS - _ = DoubleMLDIDCS(DoubleMLData(df_did_cs, y_col="y", d_cols=["d", "Z1"], t_col="t"), Lasso(), LogisticRegression()) + _ = DoubleMLDIDCS(DoubleMLDIDData(df_did_cs, y_col="y", d_cols=["d", "Z1"], t_col="t"), Lasso(), LogisticRegression()) # DIDCS time exceptions msg = ( @@ -280,7 +280,7 @@ def test_doubleml_exception_data(): df_did_cs["t"] = df_did_cs["t"] * 2 with pytest.raises(ValueError, match=msg): # non-binary t for DIDCS - _ 
= DoubleMLDIDCS(DoubleMLData(df_did_cs, y_col="y", d_cols="d", t_col="t"), Lasso(), LogisticRegression()) + _ = DoubleMLDIDCS(DoubleMLDIDData(df_did_cs, y_col="y", d_cols="d", t_col="t"), Lasso(), LogisticRegression()) @pytest.mark.ci From 756092c9ba6b2831d398f2aa79024b2c7eed3b78 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Thu, 12 Jun 2025 17:11:47 +0200 Subject: [PATCH 67/84] adjust unit tests general --- doubleml/tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index aa1d9433..b31091a6 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -152,7 +152,7 @@ def test_make_pliv_CHS2015_return_types(): def test_make_pliv_multiway_cluster_CKMS2021_return_types(): np.random.seed(3141) res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DoubleMLData") - assert isinstance(res, DoubleMLClusterData) + assert isinstance(res, DoubleMLData) res = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="DataFrame") assert isinstance(res, pd.DataFrame) x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="array") From 6bac76e99959d092125abced8caa32ee44bd7c5a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 13 Jun 2025 08:34:27 +0200 Subject: [PATCH 68/84] enhance did_multi plotting with anticipation periods and update color palette handling --- doubleml/data/utils/panel_data_utils.py | 50 +++++++++ doubleml/did/did_multi.py | 123 +++++++++++++--------- doubleml/did/tests/test_did_multi_plot.py | 2 +- 3 files changed, 123 insertions(+), 52 deletions(-) diff --git a/doubleml/data/utils/panel_data_utils.py b/doubleml/data/utils/panel_data_utils.py index abd365eb..cc94d39f 100644 --- a/doubleml/data/utils/panel_data_utils.py +++ b/doubleml/data/utils/panel_data_utils.py @@ -1,8 +1,58 @@ +import pandas as pd + valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"} +# Units that can be used with pd.Timedelta (unambiguous) +timedelta_compatible_units = {"D", "h", "m", "s", "ms", "us", "ns"} + +# Units that require period arithmetic (ambiguous) +period_only_units = {"Y", "M"} + def _is_valid_datetime_unit(unit): if unit not in valid_datetime_units: raise ValueError("Invalid datetime unit.") else: return unit + + +def _is_timedelta_compatible(unit): + """Check if a datetime unit can be used with pd.Timedelta.""" + return unit in timedelta_compatible_units + + +def _subtract_periods_safe(datetime_values, reference_datetime, periods, unit): + """ + Safely subtract periods from datetime values, handling both timedelta-compatible + and period-only units. 
+ + Parameters + ---------- + datetime_values : pandas.Series or numpy.array + Array of datetime values to compare + reference_datetime : datetime-like + Reference datetime to subtract periods from + periods : int + Number of periods to subtract + unit : str + Datetime unit + + Returns + ------- + numpy.array + Boolean array indicating which datetime_values are >= (reference_datetime - periods) + """ + if periods == 0: + # No anticipation periods, so no datetime arithmetic needed + return datetime_values >= reference_datetime + + if _is_timedelta_compatible(unit): + # Use Timedelta for unambiguous units + period_offset = pd.Timedelta(periods, unit=unit) + return datetime_values >= (reference_datetime - period_offset) + else: + # Use Period arithmetic for ambiguous units like 'M' and 'Y' + ref_period = pd.Period(reference_datetime, freq=unit) + ref_minus_periods = ref_period - periods + datetime_periods = pd.PeriodIndex(datetime_values, freq=unit) + return datetime_periods >= ref_minus_periods diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py index 646ad41d..cdfe0756 100644 --- a/doubleml/did/did_multi.py +++ b/doubleml/did/did_multi.py @@ -10,6 +10,7 @@ from sklearn.base import clone from doubleml.data import DoubleMLPanelData +from doubleml.data.utils.panel_data_utils import _subtract_periods_safe from doubleml.did.did_aggregation import DoubleMLDIDAggregation from doubleml.did.did_binary import DoubleMLDIDBinary from doubleml.did.did_cs_binary import DoubleMLDIDCSBinary @@ -989,8 +990,9 @@ def plot_effects( first_treated_periods = sorted(df["First Treated"].unique()) n_periods = len(first_treated_periods) - # Set up colors - colors = dict(zip(["pre", "post"], sns.color_palette(color_palette)[:2])) + # Set up colors - ensure 'post' always gets the second color + palette_colors = sns.color_palette(color_palette) + colors = {"pre": palette_colors[0], "post": palette_colors[1], "anticipation": palette_colors[2]} # Check if x-axis is datetime or convert to float is_datetime = pd.api.types.is_datetime64_any_dtype(df["Evaluation Period"]) @@ -1034,9 +1036,20 @@ def plot_effects( Line2D([0], [0], color="red", linestyle=":", alpha=0.7, label="Treatment start"), Line2D([0], [0], color="black", linestyle="--", alpha=0.5, label="Zero effect"), Line2D([0], [0], marker="o", color=colors["pre"], linestyle="None", label="Pre-treatment", markersize=5), - Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5), ] - legend_ax.legend(handles=legend_elements, loc="center", ncol=4, mode="expand", borderaxespad=0.0) + + if self.anticipation_periods > 0: + legend_elements.append( + Line2D( + [0], [0], marker="o", color=colors["anticipation"], linestyle="None", label="Anticipation", markersize=5 + ) + ) + + legend_elements.append( + Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5) + ) + + legend_ax.legend(handles=legend_elements, loc="center", ncol=len(legend_elements), mode="expand", borderaxespad=0.0) # Set title and layout plt.suptitle(title, y=1.02) @@ -1057,7 +1070,7 @@ def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_ period : int or datetime Treatment period for this group. colors : dict - Dictionary with 'pre' and 'post' color values. + Dictionary with 'pre', 'anticipation' (if applicable), and 'post' color values. is_datetime : bool Whether the x-axis represents datetime values. 
jitter_value : float @@ -1074,56 +1087,64 @@ def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_ ax.axvline(x=period, color="red", linestyle=":", alpha=0.7) ax.axhline(y=0, color="black", linestyle="--", alpha=0.5) - # Split and jitter data - pre_treatment = add_jitter( - period_df[period_df["Pre-Treatment"]], - "Evaluation Period", - is_datetime=is_datetime, - jitter_value=jitter_value, - ) - post_treatment = add_jitter( - period_df[~period_df["Pre-Treatment"]], - "Evaluation Period", - is_datetime=is_datetime, - jitter_value=jitter_value, - ) - - # Plot pre-treatment points - if not pre_treatment.empty: - ax.scatter(pre_treatment["jittered_x"], pre_treatment["Estimate"], color=colors["pre"], alpha=0.8, s=30) - ax.errorbar( - pre_treatment["jittered_x"], - pre_treatment["Estimate"], - yerr=[ - pre_treatment["Estimate"] - pre_treatment["CI Lower"], - pre_treatment["CI Upper"] - pre_treatment["Estimate"], - ], - fmt="o", - capsize=3, - color=colors["pre"], - markersize=4, - markeredgewidth=1, - linewidth=1, + # Categorize periods + if is_datetime: + # For datetime, use safe period arithmetic that handles both timedelta-compatible and period-only units + anticipation_ge_mask = _subtract_periods_safe( + period_df["Evaluation Period"], period, self.anticipation_periods, self._dml_data.datetime_unit ) + anticipation_mask = ( + (self.anticipation_periods > 0) + & period_df["Pre-Treatment"] + & anticipation_ge_mask + & (period_df["Evaluation Period"] < period) + ) + else: + # For numeric periods, simple arithmetic works + anticipation_mask = ( + (self.anticipation_periods > 0) + & period_df["Pre-Treatment"] + & (period_df["Evaluation Period"] >= period - self.anticipation_periods) + & (period_df["Evaluation Period"] < period) + ) + + pre_treatment_mask = period_df["Pre-Treatment"] & ~anticipation_mask + post_treatment_mask = ~period_df["Pre-Treatment"] + + # Define category mappings + categories = [("pre", pre_treatment_mask), ("anticipation", anticipation_mask), ("post", post_treatment_mask)] - # Plot post-treatment points - if not post_treatment.empty: - ax.scatter(post_treatment["jittered_x"], post_treatment["Estimate"], color=colors["post"], alpha=0.8, s=30) - ax.errorbar( - post_treatment["jittered_x"], - post_treatment["Estimate"], - yerr=[ - post_treatment["Estimate"] - post_treatment["CI Lower"], - post_treatment["CI Upper"] - post_treatment["Estimate"], - ], - fmt="o", - capsize=3, - color=colors["post"], - markersize=4, - markeredgewidth=1, - linewidth=1, + # Plot each category + for category_name, mask in categories: + if not mask.any(): + continue + + category_data = add_jitter( + period_df[mask], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, ) + if not category_data.empty: + ax.scatter( + category_data["jittered_x"], category_data["Estimate"], color=colors[category_name], alpha=0.8, s=30 + ) + ax.errorbar( + category_data["jittered_x"], + category_data["Estimate"], + yerr=[ + category_data["Estimate"] - category_data["CI Lower"], + category_data["CI Upper"] - category_data["Estimate"], + ], + fmt="o", + capsize=3, + color=colors[category_name], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + # Format axes if is_datetime: period_str = np.datetime64(period, self._dml_data.datetime_unit) diff --git a/doubleml/did/tests/test_did_multi_plot.py b/doubleml/did/tests/test_did_multi_plot.py index bcb8b786..5bcd0aae 100644 --- a/doubleml/did/tests/test_did_multi_plot.py +++ b/doubleml/did/tests/test_did_multi_plot.py @@ 
-130,7 +130,7 @@ def test_plot_effects_color_palette(doubleml_did_fixture): assert isinstance(fig, plt.Figure) # Test with a custom color list - custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + custom_colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # Red, Green, Blue fig, _ = dml_obj.plot_effects(color_palette=custom_colors) assert isinstance(fig, plt.Figure) From 77b1a6b53634841e90bb6f2fe848bddad2c038cd Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 13 Jun 2025 10:19:35 +0200 Subject: [PATCH 69/84] update data summary to include unique IDs count in DoubleMLPanelData --- doubleml/data/panel_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index 59ad531c..4ba659ce 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -141,7 +141,7 @@ def _data_summary_str(self): f"Id variable: {self.id_col}\n" ) - data_summary += f"No. Observations: {self.n_obs}\n" + data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n" return data_summary @classmethod From e52122f348e222b06b581df323c306825e3fb108 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 16 Jun 2025 11:04:59 +0200 Subject: [PATCH 70/84] add flexible summary with multiple formats --- doubleml/did/did_binary.py | 63 ++++---------------- doubleml/did/did_cs_binary.py | 65 ++++---------------- doubleml/double_ml.py | 109 +++++++++++++++++++++++----------- doubleml/irm/iivm.py | 15 +---- 4 files changed, 100 insertions(+), 152 deletions(-) diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index 99e18e28..99ce7ef9 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -239,58 +239,17 @@ def __init__( self._sensitivity_implemented = True self._external_predictions_implemented = True - def __str__(self): - class_name = self.__class__.__name__ - header = f"================== {class_name} Object ==================\n" - data_summary = self._dml_data._data_summary_str() - score_info = ( - f"Score function: {str(self.score)}\n" - f"Treatment group: {str(self.g_value)}\n" - f"Pre-treatment period: {str(self.t_value_pre)}\n" - f"Evaluation period: {str(self.t_value_eval)}\n" - f"Control group: {str(self.control_group)}\n" - f"Anticipation periods: {str(self.anticipation_periods)}\n" - f"Effective sample size: {str(self.n_obs_subset)}\n" - ) - learner_info = "" - for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\n" - if self.nuisance_loss is not None: - learner_info += "Out-of-sample Performance:\n" - is_classifier = [value for value in self._is_classifier.values()] - is_regressor = [not value for value in is_classifier] - if any(is_regressor): - learner_info += "Regression:\n" - for learner in [key for key, value in self._is_classifier.items() if value is False]: - learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" - if any(is_classifier): - learner_info += "Classification:\n" - for learner in [key for key, value in self._is_classifier.items() if value is True]: - learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" - - if self._is_cluster_data: - resampling_info = ( - f"No. folds per cluster: {self._n_folds_per_cluster}\n" - f"No. folds: {self.n_folds}\n" - f"No. repeated sample splits: {self.n_rep}\n" - ) - else: - resampling_info = f"No. folds: {self.n_folds}\nNo. 
repeated sample splits: {self.n_rep}\n" - fit_summary = str(self.summary) - res = ( - header - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ Score & algorithm ------------------\n" - + score_info - + "\n------------------ Machine learner ------------------\n" - + learner_info - + "\n------------------ Resampling ------------------\n" - + resampling_info - + "\n------------------ Fit summary ------------------\n" - + fit_summary - ) - return res + def _format_score_info_str(self): + lines = [ + f"Score function: {str(self.score)}", + f"Treatment group: {str(self.g_value)}", + f"Pre-treatment period: {str(self.t_value_pre)}", + f"Evaluation period: {str(self.t_value_eval)}", + f"Control group: {str(self.control_group)}", + f"Anticipation periods: {str(self.anticipation_periods)}", + f"Effective sample size: {str(self.n_obs_subset)}", + ] + return "\\n".join(lines) @property def g_value(self): diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index a6005d53..73b9152f 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -156,58 +156,19 @@ def __init__( self._sensitivity_implemented = True self._external_predictions_implemented = True - def __str__(self): - class_name = self.__class__.__name__ - header = f"================== {class_name} Object ==================\n" - data_summary = self._dml_data._data_summary_str() - score_info = ( - f"Score function: {str(self.score)}\n" - f"Treatment group: {str(self.g_value)}\n" - f"Pre-treatment period: {str(self.t_value_pre)}\n" - f"Evaluation period: {str(self.t_value_eval)}\n" - f"Control group: {str(self.control_group)}\n" - f"Anticipation periods: {str(self.anticipation_periods)}\n" - f"Effective sample size: {str(self.n_obs_subset)}\n" - ) - learner_info = "" - for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\n" - if self.nuisance_loss is not None: - learner_info += "Out-of-sample Performance:\n" - is_classifier = [value for value in self._is_classifier.values()] - is_regressor = [not value for value in is_classifier] - if any(is_regressor): - learner_info += "Regression:\n" - for learner in [key for key, value in self._is_classifier.items() if value is False]: - learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" - if any(is_classifier): - learner_info += "Classification:\n" - for learner in [key for key, value in self._is_classifier.items() if value is True]: - learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" - - if self._is_cluster_data: - resampling_info = ( - f"No. folds per cluster: {self._n_folds_per_cluster}\n" - f"No. folds: {self.n_folds}\n" - f"No. repeated sample splits: {self.n_rep}\n" - ) - else: - resampling_info = f"No. folds: {self.n_folds}\nNo. 
repeated sample splits: {self.n_rep}\n" - fit_summary = str(self.summary) - res = ( - header - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ Score & algorithm ------------------\n" - + score_info - + "\n------------------ Machine learner ------------------\n" - + learner_info - + "\n------------------ Resampling ------------------\n" - + resampling_info - + "\n------------------ Fit summary ------------------\n" - + fit_summary - ) - return res + def _format_score_info_str(self): + lines = [ + f"Score function: {str(self.score)}", + f"Treatment group: {str(self.g_value)}", + f"Pre-treatment period: {str(self.t_value_pre)}", + f"Evaluation period: {str(self.t_value_eval)}", + f"Control group: {str(self.control_group)}", + f"Anticipation periods: {str(self.anticipation_periods)}", + f"Effective sample size: {str(self.n_obs_subset)}", + ] + return "\n".join(lines) + + # _format_learner_info_str method is inherited from DoubleML base class. @property def g_value(self): diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 88f677ef..72f3b44a 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -110,50 +110,87 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): self._i_rep = None self._i_treat = None - def __str__(self): + def _format_header_str(self): class_name = self.__class__.__name__ - header = f"================== {class_name} Object ==================\n" - data_summary = self._dml_data._data_summary_str() - score_info = f"Score function: {str(self.score)}\n" + return f"================== {class_name} Object ==================" + + def _format_score_info_str(self): + return f"Score function: {str(self.score)}" + + def _format_learner_info_str(self): learner_info = "" - for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\n" + if self.learner is not None: + for key, value in self.learner.items(): + learner_info += f"Learner {key}: {str(value)}\\n" if self.nuisance_loss is not None: - learner_info += "Out-of-sample Performance:\n" - is_classifier = [value for value in self._is_classifier.values()] - is_regressor = [not value for value in is_classifier] - if any(is_regressor): - learner_info += "Regression:\n" - for learner in [key for key, value in self._is_classifier.items() if value is False]: - learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" - if any(is_classifier): - learner_info += "Classification:\n" - for learner in [key for key, value in self._is_classifier.items() if value is True]: - learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + learner_info += "Out-of-sample Performance:\\n" + # Check if _is_classifier is populated, otherwise, it might be called before fit + if self._is_classifier: + is_classifier_any = any(self._is_classifier.values()) + is_regressor_any = any(not v for v in self._is_classifier.values()) + + if is_regressor_any: + learner_info += "Regression:\\n" + for learner_name in self.params_names: # Iterate through known learners + if not self._is_classifier.get(learner_name, True): # Default to not regressor if not found + loss_val = self.nuisance_loss.get(learner_name, "N/A") + learner_info += f"Learner {learner_name} RMSE: {loss_val}\\n" + if is_classifier_any: + learner_info += "Classification:\\n" + for learner_name in self.params_names: # Iterate through known learners + if self._is_classifier.get(learner_name, False): # Default to not classifier if not found + 
loss_val = self.nuisance_loss.get(learner_name, "N/A") + learner_info += f"Learner {learner_name} Log Loss: {loss_val}\\n" + else: + learner_info += " (Run .fit() to see out-of-sample performance)\\n" + return learner_info.strip() + def _format_resampling_info_str(self): if self._is_cluster_data: - resampling_info = ( - f"No. folds per cluster: {self._n_folds_per_cluster}\n" - f"No. folds: {self.n_folds}\n" - f"No. repeated sample splits: {self.n_rep}\n" + return ( + f"No. folds per cluster: {self._n_folds_per_cluster}\\\\n" + f"No. folds: {self.n_folds}\\\\n" + f"No. repeated sample splits: {self.n_rep}" ) else: - resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n" - fit_summary = str(self.summary) - res = ( - header - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ Score & algorithm ------------------\n" - + score_info - + "\n------------------ Machine learner ------------------\n" - + learner_info - + "\n------------------ Resampling ------------------\n" - + resampling_info - + "\n------------------ Fit summary ------------------\n" - + fit_summary + return f"No. folds: {self.n_folds}\\\\nNo. repeated sample splits: {self.n_rep}" + + def _format_additional_info_str(self): + """ + Hook for subclasses to add additional information to the string representation. + Returns an empty string by default. + Subclasses should override this method to provide content. + The content should not include the 'Additional Information' header itself. + """ + return "" + + def __str__(self): + header = self._format_header_str() + # Assumes self._dml_data._data_summary_str() exists and is well-formed + data_summary = self._dml_data._data_summary_str() + score_info = self._format_score_info_str() + learner_info = self._format_learner_info_str() + resampling_info = self._format_resampling_info_str() + fit_summary = str(self.summary) # Assumes self.summary is well-formed + + representation = ( + f"{header}\\n" + f"\\n------------------ Data Summary ------------------\\n" + f"{data_summary}\\n" + f"\\n------------------ Score & Algorithm ------------------\\n" + f"{score_info}\\n" + f"\\n------------------ Machine Learner ------------------\\n" + f"{learner_info}\\n" + f"\\n------------------ Resampling ------------------\\n" + f"{resampling_info}\\n" + f"\\n------------------ Fit Summary ------------------\\n" + f"{fit_summary}" ) - return res + + additional_info = self._format_additional_info_str() + if additional_info: + representation += f"\\n\\n------------------ Additional Information ------------------\\n" f"{additional_info}" + return representation @property def n_folds(self): diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index a43c0a03..b3cc11e7 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -197,22 +197,13 @@ def __init__( self.subgroups = subgroups self._external_predictions_implemented = True - def __str__(self): - parent_str = super().__str__() - - # add robust confset + def _format_additional_info_str(self): if self.framework is None: - confset_str = "" + return "" else: confset = self.robust_confset() formatted_confset = ", ".join([f"[{lower:.4f}, {upper:.4f}]" for lower, upper in confset]) - confset_str = ( - "\n\n--------------- Additional Information ----------------\n" - + f"Robust Confidence Set: {formatted_confset}\n" - ) - - res = parent_str + confset_str - return res + return f"Robust Confidence Set: {formatted_confset}" @property def normalize_ipw(self): From 
bf7e16af8a6b3dde11f7fd80c76549659b1e11a7 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 16 Jun 2025 12:09:09 +0200 Subject: [PATCH 71/84] fix format --- doubleml/double_ml.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 72f3b44a..694968bc 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -121,39 +121,39 @@ def _format_learner_info_str(self): learner_info = "" if self.learner is not None: for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\\n" + learner_info += f"Learner {key}: {str(value)}\n" if self.nuisance_loss is not None: - learner_info += "Out-of-sample Performance:\\n" + learner_info += "Out-of-sample Performance:\n" # Check if _is_classifier is populated, otherwise, it might be called before fit if self._is_classifier: is_classifier_any = any(self._is_classifier.values()) is_regressor_any = any(not v for v in self._is_classifier.values()) if is_regressor_any: - learner_info += "Regression:\\n" + learner_info += "Regression:\n" for learner_name in self.params_names: # Iterate through known learners if not self._is_classifier.get(learner_name, True): # Default to not regressor if not found loss_val = self.nuisance_loss.get(learner_name, "N/A") - learner_info += f"Learner {learner_name} RMSE: {loss_val}\\n" + learner_info += f"Learner {learner_name} RMSE: {loss_val}\n" if is_classifier_any: - learner_info += "Classification:\\n" + learner_info += "Classification:\n" for learner_name in self.params_names: # Iterate through known learners if self._is_classifier.get(learner_name, False): # Default to not classifier if not found loss_val = self.nuisance_loss.get(learner_name, "N/A") - learner_info += f"Learner {learner_name} Log Loss: {loss_val}\\n" + learner_info += f"Learner {learner_name} Log Loss: {loss_val}\n" else: - learner_info += " (Run .fit() to see out-of-sample performance)\\n" + learner_info += " (Run .fit() to see out-of-sample performance)\n" return learner_info.strip() def _format_resampling_info_str(self): if self._is_cluster_data: return ( - f"No. folds per cluster: {self._n_folds_per_cluster}\\\\n" - f"No. folds: {self.n_folds}\\\\n" + f"No. folds per cluster: {self._n_folds_per_cluster}\n" + f"No. folds: {self.n_folds}\n" f"No. repeated sample splits: {self.n_rep}" ) else: - return f"No. folds: {self.n_folds}\\\\nNo. repeated sample splits: {self.n_rep}" + return f"No. folds: {self.n_folds}\nNo. 
repeated sample splits: {self.n_rep}" def _format_additional_info_str(self): """ @@ -174,22 +174,22 @@ def __str__(self): fit_summary = str(self.summary) # Assumes self.summary is well-formed representation = ( - f"{header}\\n" - f"\\n------------------ Data Summary ------------------\\n" - f"{data_summary}\\n" - f"\\n------------------ Score & Algorithm ------------------\\n" - f"{score_info}\\n" - f"\\n------------------ Machine Learner ------------------\\n" - f"{learner_info}\\n" - f"\\n------------------ Resampling ------------------\\n" - f"{resampling_info}\\n" - f"\\n------------------ Fit Summary ------------------\\n" + f"{header}\n" + f"\n------------------ Data Summary ------------------\n" + f"{data_summary}\n" + f"\n------------------ Score & Algorithm ------------------\n" + f"{score_info}\n" + f"\n------------------ Machine Learner ------------------\n" + f"{learner_info}\n" + f"\n------------------ Resampling ------------------\n" + f"{resampling_info}\n" + f"\n------------------ Fit Summary ------------------\n" f"{fit_summary}" ) additional_info = self._format_additional_info_str() if additional_info: - representation += f"\\n\\n------------------ Additional Information ------------------\\n" f"{additional_info}" + representation += f"\n\n------------------ Additional Information ------------------\n" f"{additional_info}" return representation @property From 6beebd83fd27b79aa445a348c96f44007bb8541f Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 12:32:13 +0200 Subject: [PATCH 72/84] fix unit tests --- doubleml/did/tests/test_did_external_predictions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/did/tests/test_did_external_predictions.py b/doubleml/did/tests/test_did_external_predictions.py index 7234be8e..194db374 100644 --- a/doubleml/did/tests/test_did_external_predictions.py +++ b/doubleml/did/tests/test_did_external_predictions.py @@ -24,7 +24,7 @@ def n_rep(request): @pytest.fixture(scope="module") def doubleml_did_fixture(did_score, n_rep): ext_predictions = {"d": {}} - dml_data = make_did_SZ2020(n_obs=500, return_type="DoubleMLData") + dml_data = make_did_SZ2020(n_obs=500, return_type="DoubleMLDIDData") all_smpls = draw_smpls(len(dml_data.y), 5, n_rep=n_rep, groups=dml_data.d) kwargs = {"obj_dml_data": dml_data, "score": did_score, "n_rep": n_rep, "draw_sample_splitting": False} dml_did = DoubleMLDID(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) From fb421f7d6ba0009ae33d193d4dd0ae2dd4e3b849 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 12:46:36 +0200 Subject: [PATCH 73/84] adjust workflow in parent class `DoubleML` --- doubleml/double_ml.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index fe4cec5d..6d2f2ca4 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -7,7 +7,7 @@ from scipy.stats import norm from sklearn.base import is_classifier, is_regressor -from doubleml.data import DoubleMLClusterData, DoubleMLPanelData +from doubleml.data import DoubleMLPanelData, DoubleMLDIDData, DoubleMLSSMData, DoubleMLRDDData from doubleml.data.base_data import DoubleMLBaseData from doubleml.double_ml_framework import DoubleMLFramework from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting @@ -30,13 +30,22 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was 
passed." ) self._is_cluster_data = False - if isinstance(obj_dml_data, DoubleMLClusterData): + if obj_dml_data.is_cluster_data: if obj_dml_data.n_cluster_vars > 2: raise NotImplementedError("Multi-way (n_ways > 2) clustering not yet implemented.") self._is_cluster_data = True self._is_panel_data = False if isinstance(obj_dml_data, DoubleMLPanelData): self._is_panel_data = True + self._is_did_data = False + if isinstance(obj_dml_data, DoubleMLDIDData): + self._is_did_data = True + self._is_ssm_data = False + if isinstance(obj_dml_data, DoubleMLSSMData): + self._is_ssm_data = True + self._is_rdd_data = False + if isinstance(obj_dml_data, DoubleMLRDDData): + self._is_rdd_data = True self._dml_data = obj_dml_data self._n_obs = self._dml_data.n_obs From b11c0cbce798c6f8787bd5a0e7812b98b42c382a Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 13:30:24 +0200 Subject: [PATCH 74/84] update refactoring acc. to unit test results --- doubleml/double_ml.py | 2 +- doubleml/irm/apos.py | 6 +++--- doubleml/irm/qte.py | 5 ++--- doubleml/irm/ssm.py | 14 +++++++------- doubleml/irm/tests/test_apo_exceptions.py | 2 +- doubleml/irm/tests/test_apos_exceptions.py | 2 +- doubleml/irm/tests/test_ssm_exceptions.py | 2 +- .../datasets/dgp_pliv_multiway_cluster_CKMS2021.py | 4 ++-- doubleml/rdd/rdd.py | 14 +++++++------- doubleml/tests/test_datasets.py | 2 +- doubleml/tests/test_exceptions.py | 5 +++-- doubleml/tests/test_multiway_cluster.py | 2 +- doubleml/tests/test_nonlinear_cluster.py | 8 ++++---- doubleml/tests/test_return_types.py | 6 +++--- doubleml/tests/test_sensitivity_cluster.py | 4 ++-- doubleml/utils/_check_return_types.py | 5 ++--- 16 files changed, 41 insertions(+), 42 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 6d2f2ca4..818bb3ab 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -16,7 +16,7 @@ from doubleml.utils.gain_statistics import gain_statistics from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling -_implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData"] +_implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData", "DoubleMLDIDData", "DoubleMLSSMData", "DoubleMLRDDData"] class DoubleML(ABC): diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 2960e90d..c272d0b4 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -6,7 +6,7 @@ from joblib import Parallel, delayed from sklearn.base import clone -from doubleml.data import DoubleMLClusterData, DoubleMLData +from doubleml.data import DoubleMLData from doubleml.double_ml import DoubleML from doubleml.double_ml_framework import concat from doubleml.irm.apo import DoubleMLAPO @@ -36,7 +36,7 @@ def __init__( draw_sample_splitting=True, ): self._dml_data = obj_dml_data - self._is_cluster_data = isinstance(obj_dml_data, DoubleMLClusterData) + self._is_cluster_data = obj_dml_data.is_cluster_data self._check_data(self._dml_data) self._all_treatment_levels = np.unique(self._dml_data.d) @@ -824,7 +824,7 @@ def _check_treatment_levels(self, treatment_levels): def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): - raise TypeError("The data must be of DoubleMLData or DoubleMLClusterData type.") + raise TypeError("The data must be of DoubleMLData type.") if obj_dml_data.z is not None: raise ValueError("The data must not contain instrumental variables.") return diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index a2c803a3..de25b3ef 100644 --- a/doubleml/irm/qte.py +++ 
b/doubleml/irm/qte.py @@ -3,7 +3,7 @@ from joblib import Parallel, delayed from sklearn.base import clone -from doubleml.data import DoubleMLClusterData, DoubleMLData +from doubleml.data import DoubleMLData from doubleml.double_ml_framework import concat from doubleml.irm.cvar import DoubleMLCVAR from doubleml.irm.lpq import DoubleMLLPQ @@ -125,8 +125,7 @@ def __init__( # check data self._is_cluster_data = False - if isinstance(obj_dml_data, DoubleMLClusterData): - self._is_cluster_data = True + self._is_cluster_data = obj_dml_data.is_cluster_data self._check_data(self._dml_data) # initialize framework which is constructed after the fit method is called diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py index c84b326d..2c8479a7 100644 --- a/doubleml/irm/ssm.py +++ b/doubleml/irm/ssm.py @@ -6,7 +6,7 @@ from sklearn.model_selection import train_test_split from sklearn.utils import check_X_y -from doubleml.data.base_data import DoubleMLData +from doubleml.data.ssm_data import DoubleMLSSMData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_score, _check_trimming @@ -19,8 +19,8 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLSSMData` object + The :class:`DoubleMLSSMData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. @@ -66,7 +66,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml import DoubleMLData + >>> from doubleml import DoubleMLSSMData >>> from sklearn.linear_model import LassoCV, LogisticRegressionCV() >>> from sklearn.base import clone >>> np.random.seed(3146) @@ -82,7 +82,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): >>> s = np.where(np.dot(X, beta) + 0.25 * d + z + e[0] > 0, 1, 0) >>> y = np.dot(X, beta) + 0.5 * d + e[1] >>> y[s == 0] = 0 - >>> simul_data = DoubleMLData.from_arrays(X, y, d, z=None, t=s) + >>> simul_data = DoubleMLSSMData.from_arrays(X, y, d, z=None, s=s) >>> learner = LassoCV() >>> learner_class = LogisticRegressionCV() >>> ml_g_sim = clone(learner) @@ -183,9 +183,9 @@ def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} def _check_data(self, obj_dml_data): - if not isinstance(obj_dml_data, DoubleMLData): + if not isinstance(obj_dml_data, DoubleMLSSMData): raise TypeError( - f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + f"The data must be of DoubleMLSSMData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) if obj_dml_data.z_cols is not None and self._score == "missing-at-random": warnings.warn( diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index e643efca..cb267a98 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -22,7 +22,7 @@ @pytest.mark.ci def test_apo_exception_data(): - msg = "The data must be of DoubleMLData or DoubleMLClusterData type." + msg = "The data must be of DoubleMLData type." 
with pytest.raises(TypeError, match=msg): _ = DoubleMLAPO(pd.DataFrame(), ml_g, ml_m, treatment_level=0) diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index f1c9b3d6..93274cee 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -20,7 +20,7 @@ @pytest.mark.ci def test_apos_exception_data(): - msg = "The data must be of DoubleMLData or DoubleMLClusterData type." + msg = "The data must be of DoubleMLData type." with pytest.raises(TypeError, match=msg): _ = DoubleMLAPOS(pd.DataFrame(), ml_g, ml_m, treatment_levels=0) diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py index 50b082ec..ee67dbec 100644 --- a/doubleml/irm/tests/test_ssm_exceptions.py +++ b/doubleml/irm/tests/test_ssm_exceptions.py @@ -30,7 +30,7 @@ def n_coefs(self): @pytest.mark.ci def test_ssm_exception_data(): - msg = "The data must be of DoubleMLData or DoubleMLClusterData type." + msg = "The data must be of DoubleMLData type." with pytest.raises(TypeError, match=msg): _ = DoubleMLSSM(pd.DataFrame(), ml_g, ml_pi, ml_m) diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py index 39ff6a26..0d64c42f 100644 --- a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -62,8 +62,8 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return theta : The value of the causal parameter. return_type : - If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where - ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object where + ``DoubleMLData.data`` is a ``pd.DataFrame``. If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 858ae5ed..565f0241 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -7,7 +7,7 @@ from sklearn.base import clone from sklearn.utils.multiclass import type_of_target -from doubleml import DoubleMLData +from doubleml import DoubleMLRDDData from doubleml.double_ml import DoubleML from doubleml.rdd._utils import _is_rdrobust_available from doubleml.utils._checks import _check_resampling_specification, _check_supports_sample_weights @@ -82,7 +82,7 @@ class RDFlex: >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(123) >>> data_dict = make_simple_rdd_data(fuzzy=True) - >>> obj_dml_data = dml.DoubleMLData.from_arrays(x=data_dict["X"], y=data_dict["Y"], d=data_dict["D"], s=data_dict["score"]) + >>> obj_dml_data = dml.DoubleMLRDDData.from_arrays(x=data_dict["X"], y=data_dict["Y"], d=data_dict["D"], s=data_dict["score"]) >>> ml_g = RandomForestRegressor() >>> ml_m = RandomForestClassifier() >>> rdflex_obj = dml.rdd.RDFlex(obj_dml_data, ml_g, ml_m, fuzzy=True) @@ -482,21 +482,21 @@ def _initialize_arrays(self): return M_Y, M_D, h, rdd_obj, all_coef, all_se, all_ci def _check_data(self, obj_dml_data, cutoff): - if not isinstance(obj_dml_data, DoubleMLData): + if not isinstance(obj_dml_data, DoubleMLRDDData): raise TypeError( - f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + f"The data must be of DoubleMLRDDData type. 
{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) # score checks - if obj_dml_data.s_col is None: + if obj_dml_data.score_col is None: raise ValueError("Incompatible data. " + "Score variable has not been set. ") - is_continuous = type_of_target(obj_dml_data.s) == "continuous" + is_continuous = type_of_target(obj_dml_data.score) == "continuous" if not is_continuous: raise ValueError("Incompatible data. " + "Score variable has to be continuous. ") if not isinstance(cutoff, (int, float)): raise TypeError(f"Cutoff value has to be a float or int. Object of type {str(type(cutoff))} passed.") - if not (obj_dml_data.s.min() <= cutoff <= obj_dml_data.s.max()): + if not (obj_dml_data.score.min() <= cutoff <= obj_dml_data.score.max()): raise ValueError("Cutoff value is not within the range of the score variable. ") # treatment checks diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index b31091a6..f69b681e 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from doubleml import DoubleMLClusterData, DoubleMLData +from doubleml import DoubleMLData from doubleml.datasets import fetch_401K, fetch_bonus from doubleml.irm.datasets import ( make_confounded_irm_data, diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 5178adc6..7839d7c4 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -54,7 +54,7 @@ @pytest.mark.ci def test_doubleml_exception_data(): - msg = "The data must be of DoubleMLData or DoubleMLClusterData type." + msg = "The data must be of DoubleMLData or DoubleMLClusterData or DoubleMLDIDData or DoubleMLSSMData or DoubleMLRDDData type." with pytest.raises(TypeError, match=msg): _ = DoubleMLPLR(pd.DataFrame(), ml_l, ml_m) @@ -1351,13 +1351,14 @@ def test_doubleml_cluster_not_yet_implemented(): df = dml_cluster_data_pliv.data.copy() df["cluster_var_k"] = df["cluster_var_i"] + df["cluster_var_j"] - 2 - dml_cluster_data_multiway = DoubleMLClusterData( + dml_cluster_data_multiway = DoubleMLData( df, y_col="Y", d_cols="D", x_cols=["X1", "X5"], z_cols="Z", cluster_cols=["cluster_var_i", "cluster_var_j", "cluster_var_k"], + is_cluster_data=True, ) assert dml_cluster_data_multiway.n_cluster_vars == 3 msg = r"Multi-way \(n_ways > 2\) clustering not yet implemented." 
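The test adjustments in these patches follow a common migration pattern for the refactored data backends: the deprecated DoubleMLClusterData wrapper is replaced by DoubleMLData with is_cluster_data=True, selection models move to the dedicated DoubleMLSSMData backend, and RDFlex expects DoubleMLRDDData with an explicit score column. A minimal sketch of that pattern, assuming the constructor signatures used in the adjusted tests of this branch (toy data and column names below are purely illustrative, not part of the test suite):

    import numpy as np
    import pandas as pd
    import doubleml as dml

    # toy data, for illustration only
    rng = np.random.default_rng(3141)
    n = 100
    x = rng.normal(size=(n, 3))
    y = rng.normal(size=n)
    d = rng.binomial(1, 0.5, size=n)
    s = rng.binomial(1, 0.5, size=n)  # selection indicator for the SSM backend
    df = pd.DataFrame(x, columns=["x1", "x2", "x3"])
    df["y"], df["d"] = y, d
    df["score"] = rng.normal(size=n)  # running variable for the RDD backend
    df["cluster_var"] = rng.integers(0, 10, size=n)

    # clustered data: flag on DoubleMLData instead of the deprecated DoubleMLClusterData
    cluster_data = dml.DoubleMLData(
        df, y_col="y", d_cols="d", x_cols=["x1", "x2", "x3"],
        cluster_cols="cluster_var", is_cluster_data=True,
    )

    # sample-selection models use the dedicated SSM backend
    ssm_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s)

    # RDFlex expects the RDD backend with an explicit score column
    rdd_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score")
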
diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py index 10e5d445..4537cb4d 100644 --- a/doubleml/tests/test_multiway_cluster.py +++ b/doubleml/tests/test_multiway_cluster.py @@ -288,7 +288,7 @@ def dml_plr_cluster_with_index(generate_data1, learner): dml_plr_obj.fit() df = data.reset_index() - dml_cluster_data = dml.DoubleMLClusterData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") + dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) np.random.seed(3141) dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) np.random.seed(3141) diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py index 71998941..6f19b511 100644 --- a/doubleml/tests/test_nonlinear_cluster.py +++ b/doubleml/tests/test_nonlinear_cluster.py @@ -7,7 +7,7 @@ from sklearn.linear_model import Lasso, LinearRegression import doubleml as dml -from doubleml import DoubleMLClusterData +from doubleml import DoubleMLData from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021 from .test_nonlinear_score_mixin import DoubleMLPLRWithNonLinearScoreMixin @@ -20,7 +20,7 @@ # create data without insturment for plr x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array") -obj_dml_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) x, y, d, cluster_vars, z = make_pliv_multiway_cluster_CKMS2021( N, @@ -32,7 +32,7 @@ omega_V=np.array([0.25, 0]), return_type="array", ) -obj_dml_oneway_cluster_data = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars) +obj_dml_oneway_cluster_data = DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True) # only the first cluster variable is relevant with the weight setting above obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1" @@ -196,7 +196,7 @@ def dml_plr_cluster_nonlinear_with_index(generate_data1, learner): dml_plr_obj.fit() df = data.reset_index() - dml_cluster_data = dml.DoubleMLClusterData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") + dml_cluster_data = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index", is_cluster_data=True) np.random.seed(3141) dml_plr_cluster_obj = DoubleMLPLRWithNonLinearScoreMixin(dml_cluster_data, ml_l, ml_m, n_folds=n_folds) dml_plr_cluster_obj.fit() diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index 03676b74..fdb680f3 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -8,7 +8,7 @@ from doubleml import ( DoubleMLAPO, - DoubleMLClusterData, + DoubleMLData, DoubleMLCVAR, DoubleMLData, DoubleMLDID, @@ -86,14 +86,14 @@ def test_return_types(dml_obj, cls): if not dml_obj._is_cluster_data: assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls) else: - assert isinstance(dml_obj._dml_data, DoubleMLClusterData) + assert dml_obj._dml_data.is_cluster_data assert isinstance(dml_obj.fit(), cls) assert isinstance(dml_obj.__str__(), str) # called again after fit, now with numbers assert isinstance(dml_obj.summary, pd.DataFrame) # called again after fit, now with numbers if not dml_obj._is_cluster_data: assert isinstance(dml_obj.bootstrap(), cls) else: - assert isinstance(dml_obj._dml_data, DoubleMLClusterData) + assert dml_obj._dml_data.is_cluster_data assert 
isinstance(dml_obj.confint(), pd.DataFrame)
     if not dml_obj._is_cluster_data:
         assert isinstance(dml_obj.p_adjust(), pd.DataFrame)
diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py
index 83f8c270..5b6a7f1e 100644
--- a/doubleml/tests/test_sensitivity_cluster.py
+++ b/doubleml/tests/test_sensitivity_cluster.py
@@ -17,7 +17,7 @@
 
 (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(N, M, dim_x, return_type="array")
 
-obj_dml_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars)
+obj_dml_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True)
 
 (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(
     N,
@@ -29,7 +29,7 @@
     omega_V=np.array([0.25, 0]),
     return_type="array",
 )
-obj_dml_oneway_cluster_data = dml.DoubleMLClusterData.from_arrays(x, y, d, cluster_vars)
+obj_dml_oneway_cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True)
 # only the first cluster variable is relevant with the weight setting above
 obj_dml_oneway_cluster_data.cluster_cols = "cluster_var1"
diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py
index 54462059..5d93873e 100644
--- a/doubleml/utils/_check_return_types.py
+++ b/doubleml/utils/_check_return_types.py
@@ -3,7 +3,6 @@
 import plotly
 
 from doubleml import DoubleMLFramework
-from doubleml.data import DoubleMLClusterData
 from doubleml.double_ml_score_mixins import NonLinearScoreMixin
 
 
@@ -15,14 +14,14 @@ def check_basic_return_types(dml_obj, cls):
     if not dml_obj._is_cluster_data:
         assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls)
     else:
-        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+        assert dml_obj._dml_data.is_cluster_data
     assert isinstance(dml_obj.fit(), cls)
     assert isinstance(dml_obj.__str__(), str)  # called again after fit, now with numbers
     assert isinstance(dml_obj.summary, pd.DataFrame)  # called again after fit, now with numbers
     if not dml_obj._is_cluster_data:
         assert isinstance(dml_obj.bootstrap(), cls)
     else:
-        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+        assert dml_obj._dml_data.is_cluster_data
     assert isinstance(dml_obj.confint(), pd.DataFrame)
     if not dml_obj._is_cluster_data:
         assert isinstance(dml_obj.p_adjust(), pd.DataFrame)

From b9bdf7c302b165d41429368996e60b5df15ffe0f Mon Sep 17 00:00:00 2001
From: Jan Teichert-Kluge
Date: Tue, 17 Jun 2025 14:51:02 +0200
Subject: [PATCH 75/84] add check for correct data backend

---
 doubleml/did/did_cs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py
index 7f33210f..7dab27ed 100644
--- a/doubleml/did/did_cs.py
+++ b/doubleml/did/did_cs.py
@@ -4,7 +4,7 @@
 from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import type_of_target
 
-from doubleml.data.base_data import DoubleMLData
+from doubleml.data.did_data import DoubleMLDIDData
 from doubleml.double_ml import DoubleML
 from doubleml.double_ml_score_mixins import LinearScoreMixin
 from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming
@@ -177,9 +177,9 @@ def _initialize_ml_nuisance_params(self):
         self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner}
 
     def _check_data(self, obj_dml_data):
-        if not isinstance(obj_dml_data, DoubleMLData):
+        if not isinstance(obj_dml_data, DoubleMLDIDData):
             raise TypeError(
-                "For repeated cross sections the data must 
be of DoubleMLData type. " + "For repeated cross sections the data must be of DoubleMLDIDData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) if obj_dml_data.z_cols is not None: From 4f70523525731456a4148a72dd8d4b9b7b0a4e0c Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 14:51:17 +0200 Subject: [PATCH 76/84] renaming after refactoring --- doubleml/rdd/rdd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 565f0241..195fbba4 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -115,7 +115,7 @@ def __init__( self._check_data(obj_dml_data, cutoff) self._dml_data = obj_dml_data - self._score = self._dml_data.s - cutoff + self._score = self._dml_data.score - cutoff self._cutoff = cutoff self._intendend_treatment = (self._score >= 0).astype(bool) self._fuzzy = fuzzy From 19eab819b7bc4504a008ecef967a586a9f8b1df9 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 14:51:41 +0200 Subject: [PATCH 77/84] adjust dummy data (is_cluster_data flag) --- doubleml/tests/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py index a241b58a..577ed7ed 100644 --- a/doubleml/tests/_utils.py +++ b/doubleml/tests/_utils.py @@ -9,8 +9,9 @@ class DummyDataClass(DoubleMLBaseData): - def __init__(self, data): + def __init__(self, data, is_cluster_data=False): DoubleMLBaseData.__init__(self, data) + self.is_cluster_data = is_cluster_data @property def n_coefs(self): From c3fbbb8f00a9ae54ccd44175d2fa1a16d7c5c3b3 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 14:51:47 +0200 Subject: [PATCH 78/84] adjust unit tests --- doubleml/rdd/tests/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/rdd/tests/conftest.py b/doubleml/rdd/tests/conftest.py index b279ea93..75c9272b 100644 --- a/doubleml/rdd/tests/conftest.py +++ b/doubleml/rdd/tests/conftest.py @@ -3,7 +3,7 @@ import pytest from sklearn.dummy import DummyClassifier, DummyRegressor -from doubleml import DoubleMLData +from doubleml import DoubleMLRDDData from doubleml.rdd import RDFlex from doubleml.rdd._utils import _is_rdrobust_available from doubleml.rdd.datasets import make_simple_rdd_data @@ -24,7 +24,7 @@ def predict_dummy(): - make predictions using rdrobust as a reference """ - def _predict_dummy(data: DoubleMLData, cutoff, alpha, n_rep, p, fs_specification, ml_g=ml_g_dummy): + def _predict_dummy(data: DoubleMLRDDData, cutoff, alpha, n_rep, p, fs_specification, ml_g=ml_g_dummy): dml_rdflex = RDFlex( data, ml_g=ml_g, ml_m=ml_m_dummy, cutoff=cutoff, n_rep=n_rep, p=p, fs_specification=fs_specification ) @@ -81,7 +81,7 @@ def generate_data(n_obs: int, fuzzy: str, cutoff: float, binary_outcome: bool = columns = ["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])] df = pd.DataFrame(np.column_stack((data["Y"], data["D"], data["score"], data["X"])), columns=columns) - return DoubleMLData(df, y_col="y", d_cols="d", s_col="score") + return DoubleMLRDDData(df, y_col="y", d_cols="d", score_col="score") @pytest.fixture(scope="module") From 144ee607f43c93a6c1165e3b904414147e6d53e5 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 15:50:56 +0200 Subject: [PATCH 79/84] adjust t_col setter for DIDData Backend --- doubleml/data/base_data.py | 3 ++- doubleml/data/did_data.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/doubleml/data/base_data.py b/doubleml/data/base_data.py index 2297944e..88cf5379 100644 --- a/doubleml/data/base_data.py +++ b/doubleml/data/base_data.py @@ -263,7 +263,8 @@ def from_arrays( Default is ``True``. Examples - -------- >>> from doubleml import DoubleMLData + -------- + >>> from doubleml import DoubleMLData >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) diff --git a/doubleml/data/did_data.py b/doubleml/data/did_data.py index fd4fc7de..414cdc5b 100644 --- a/doubleml/data/did_data.py +++ b/doubleml/data/did_data.py @@ -300,7 +300,7 @@ def _check_disjoint_sets_t_col(self): def _set_time_var(self): """Set the time variable array.""" if hasattr(self, "_data") and self.t_col in self.data.columns: - self._t = self.data.loc[:, [self.t_col]] + self._t = self.data.loc[:, self.t_col] def _set_y_z_t(self): def _set_attr(col): From 70d67ad539a7a15fac8dbb968cf4c8d445c49e9d Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:15:31 +0200 Subject: [PATCH 80/84] fix RDDData (finally...) --- doubleml/data/rdd_data.py | 2 +- doubleml/rdd/tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/data/rdd_data.py b/doubleml/data/rdd_data.py index f19a4fa0..16f9e1c0 100644 --- a/doubleml/data/rdd_data.py +++ b/doubleml/data/rdd_data.py @@ -261,7 +261,7 @@ def _check_disjoint_sets_score_col(self): def _set_score_var(self): """Set the score variable array.""" if hasattr(self, "_data") and self.score_col in self.data.columns: - self._score = self.data.loc[:, [self.score_col]] + self._score = self.data.loc[:, self.score_col] def __str__(self): """String representation.""" diff --git a/doubleml/rdd/tests/conftest.py b/doubleml/rdd/tests/conftest.py index 75c9272b..9d13deaf 100644 --- a/doubleml/rdd/tests/conftest.py +++ b/doubleml/rdd/tests/conftest.py @@ -35,7 +35,7 @@ def _predict_dummy(data: DoubleMLRDDData, cutoff, alpha, n_rep, p, fs_specificat msg = "rdrobust is not installed. Please install it using 'pip install DoubleML[rdd]'" raise ImportError(msg) - rdrobust_model = rdrobust.rdrobust(y=data.y, x=data.s, c=cutoff, level=100 * (1 - alpha), p=p) + rdrobust_model = rdrobust.rdrobust(y=data.y, x=data.score, c=cutoff, level=100 * (1 - alpha), p=p) reference = { "model": rdrobust_model, From a322e359d5dc2e257fbb35c65f7e976569d337d7 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:31:26 +0200 Subject: [PATCH 81/84] adjust RDD Class --- doubleml/rdd/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 195fbba4..045789c3 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -22,8 +22,8 @@ class RDFlex: Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLRDDData` object + The :class:`DoubleMLRDDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods and support ``sample_weights`` (e.g.
From 0a9b3c7e32948aff252dc51a972c90425bdb521d Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:31:43 +0200 Subject: [PATCH 82/84] adjust DID classes --- doubleml/did/did.py | 12 ++++++------ doubleml/did/did_cs.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 170535ea..580d805e 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -4,7 +4,7 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target -from doubleml.data.base_data import DoubleMLData +from doubleml.data.did_data import DoubleMLDIDData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming @@ -17,8 +17,8 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLDIDData` object + The :class:`DoubleMLDIDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. @@ -71,7 +71,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> data = make_did_SZ2020(n_obs=500, return_type='DataFrame') - >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd') + >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd') >>> dml_did_obj = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m) >>> dml_did_obj.fit().summary coef std err t P>|t| 2.5 % 97.5 % @@ -176,9 +176,9 @@ def _initialize_ml_nuisance_params(self): self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} def _check_data(self, obj_dml_data): - if not isinstance(obj_dml_data, DoubleMLData): + if not isinstance(obj_dml_data, DoubleMLDIDData): raise TypeError( - "For repeated outcomes the data must be of DoubleMLData type. " + "For repeated outcomes the data must be of DoubleMLDIDData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." ) if obj_dml_data.z_cols is not None: diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index bd7d59dd..38cc4952 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -17,8 +17,8 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + obj_dml_data : :class:`DoubleMLDIDData` object + The :class:`DoubleMLDIDData` object providing the data and specifying the variables for the causal model. ml_g : estimator implementing ``fit()`` and ``predict()`` A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. 
@@ -71,7 +71,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5) >>> data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type='DataFrame') - >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', t_col='t') + >>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd', t_col='t') >>> dml_did_obj = dml.DoubleMLDIDCS(obj_dml_data, ml_g, ml_m) >>> dml_did_obj.fit().summary coef std err t P>|t| 2.5 % 97.5 % From 37f11dced954198a6e455f1b8a182ec08c3a28a3 Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:32:08 +0200 Subject: [PATCH 83/84] Adjust unit tests for DID --- doubleml/did/tests/test_did.py | 4 ++-- doubleml/did/tests/test_did_binary_exceptions.py | 2 +- doubleml/did/tests/test_did_binary_tune.py | 2 +- doubleml/did/tests/test_did_binary_vs_did_panel.py | 2 +- doubleml/did/tests/test_did_binary_vs_did_two_period.py | 2 +- doubleml/did/tests/test_did_cs.py | 4 ++-- doubleml/did/tests/test_did_cs_binary_exceptions.py | 2 +- doubleml/did/tests/test_did_cs_binary_tune.py | 2 +- doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py | 2 +- .../did/tests/test_did_cs_binary_vs_did_cs_two_period.py | 2 +- doubleml/did/tests/test_did_tune.py | 2 +- doubleml/did/tests/test_return_types.py | 6 +++--- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doubleml/did/tests/test_did.py b/doubleml/did/tests/test_did.py index 90d53a95..79feb110 100644 --- a/doubleml/did/tests/test_did.py +++ b/doubleml/did/tests/test_did.py @@ -57,7 +57,7 @@ def dml_did_fixture(generate_data_did, learner, score, in_sample_normalization, np.random.seed(3141) n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) np.random.seed(3141) dml_did_obj = dml.DoubleMLDID( @@ -182,7 +182,7 @@ def test_dml_did_experimental(generate_data_did, in_sample_normalization, learne ml_m = clone(learner[1]) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) np.random.seed(3141) dml_did_obj_without_ml_m = dml.DoubleMLDID( diff --git a/doubleml/did/tests/test_did_binary_exceptions.py b/doubleml/did/tests/test_did_binary_exceptions.py index c7aa2395..78c09a94 100644 --- a/doubleml/did/tests/test_did_binary_exceptions.py +++ b/doubleml/did/tests/test_did_binary_exceptions.py @@ -85,7 +85,7 @@ def test_check_data_exceptions(): # Test 1: Data has to be DoubleMLPanelData invalid_data_types = [ - dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + dml.data.DoubleMLDIDData(df, y_col="Col_0", d_cols="Col_1"), ] for invalid_data in invalid_data_types: diff --git a/doubleml/did/tests/test_did_binary_tune.py b/doubleml/did/tests/test_did_binary_tune.py index a817223f..0962aa5b 100644 --- a/doubleml/did/tests/test_did_binary_tune.py +++ b/doubleml/did/tests/test_did_binary_tune.py @@ -64,7 +64,7 @@ def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sa n_obs = df_panel.shape[0] all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=df_panel["d"]) - obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) # Set machine learning methods for m & g ml_g = 
clone(learner_g) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 426b413c..2eddccaf 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -79,7 +79,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza dml_did_binary_obj.fit() df_wide = dml_did_binary_obj.data_subset.copy() - dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) + dml_data = dml.data.DoubleMLDIDData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) dml_did_obj = dml.DoubleMLDID( dml_data, **dml_args, diff --git a/doubleml/did/tests/test_did_binary_vs_did_two_period.py b/doubleml/did/tests/test_did_binary_vs_did_two_period.py index 0db2a752..74575664 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_two_period.py +++ b/doubleml/did/tests/test_did_binary_vs_did_two_period.py @@ -56,7 +56,7 @@ def dml_did_binary_vs_did_fixture(generate_data_did_binary, learner, score, in_s n_obs = df_panel.shape[0] all_smpls = draw_smpls(n_obs, n_folds) - obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) # Set machine learning methods for m & g ml_g = clone(learner[0]) diff --git a/doubleml/did/tests/test_did_cs.py b/doubleml/did/tests/test_did_cs.py index ae633588..bc8e2da6 100644 --- a/doubleml/did/tests/test_did_cs.py +++ b/doubleml/did/tests/test_did_cs.py @@ -59,7 +59,7 @@ def dml_did_cs_fixture(generate_data_did_cs, learner, score, in_sample_normaliza n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d + 2 * t) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, t=t) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t) np.random.seed(3141) dml_did_cs_obj = dml.DoubleMLDIDCS( @@ -185,7 +185,7 @@ def test_dml_did_cs_experimental(generate_data_did_cs, in_sample_normalization, ml_m = clone(learner[1]) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d, t=t) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t) np.random.seed(3141) dml_did_obj_without_ml_m = dml.DoubleMLDIDCS( diff --git a/doubleml/did/tests/test_did_cs_binary_exceptions.py b/doubleml/did/tests/test_did_cs_binary_exceptions.py index b506da2d..e8d33939 100644 --- a/doubleml/did/tests/test_did_cs_binary_exceptions.py +++ b/doubleml/did/tests/test_did_cs_binary_exceptions.py @@ -85,7 +85,7 @@ def test_check_data_exceptions(): # Test 1: Data has to be DoubleMLPanelData invalid_data_types = [ - dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + dml.data.DoubleMLDIDData(df, y_col="Col_0", d_cols="Col_1"), ] for invalid_data in invalid_data_types: diff --git a/doubleml/did/tests/test_did_cs_binary_tune.py b/doubleml/did/tests/test_did_cs_binary_tune.py index 0bd2c6ab..59db23dd 100644 --- a/doubleml/did/tests/test_did_cs_binary_tune.py +++ b/doubleml/did/tests/test_did_cs_binary_tune.py @@ -63,7 +63,7 @@ def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sa dml_panel_data = dml.data.DoubleMLPanelData( df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) - obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df, y_col="y", d_cols="d", t_col="t", 
x_cols=["Z1", "Z2", "Z3", "Z4"]) n_obs = df.shape[0] strata = df["d"] + 2 * df["t"] # only valid since it values are binary diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py index 8fab2615..da7db085 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py @@ -76,7 +76,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza dml_did_binary_obj.fit() df_subset = dml_did_binary_obj.data_subset.copy() - dml_data = dml.data.DoubleMLData( + dml_data = dml.data.DoubleMLDIDData( df_subset, y_col="y", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"], t_col="t_indicator" ) dml_did_obj = dml.DoubleMLDIDCS( diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py index 73e6b827..b9e267ce 100644 --- a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -55,7 +55,7 @@ def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score dml_panel_data = dml.data.DoubleMLPanelData( df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) - obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + obj_dml_data = dml.DoubleMLDIDData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) n_obs = df.shape[0] all_smpls = draw_smpls(n_obs, n_folds) diff --git a/doubleml/did/tests/test_did_tune.py b/doubleml/did/tests/test_did_tune.py index 16ec2ee8..25899301 100644 --- a/doubleml/did/tests/test_did_tune.py +++ b/doubleml/did/tests/test_did_tune.py @@ -65,7 +65,7 @@ def dml_did_fixture(generate_data_did, learner_g, learner_m, score, in_sample_no ml_m = clone(learner_m) np.random.seed(3141) - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = dml.DoubleMLDIDData.from_arrays(x, y, d) dml_did_obj = dml.DoubleMLDID( obj_dml_data, ml_g, diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index 37105c3e..531a9706 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -3,7 +3,7 @@ import pytest from sklearn.linear_model import Lasso, LogisticRegression -from doubleml.data import DoubleMLData, DoubleMLPanelData +from doubleml.data import DoubleMLDIDData, DoubleMLPanelData from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS, DoubleMLDIDCSBinary from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 from doubleml.utils._check_return_types import ( @@ -37,8 +37,8 @@ (x, y, d, t) = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True, return_type="array") binary_outcome = np.random.binomial(n=1, p=0.5, size=N_OBS) -datasets["did_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d) -datasets["did_cs_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d, t=t) +datasets["did_binary_outcome"] = DoubleMLDIDData.from_arrays(x, binary_outcome, d) +datasets["did_cs_binary_outcome"] = DoubleMLDIDData.from_arrays(x, binary_outcome, d, t=t) dml_objs = [ (DoubleMLDID(datasets["did"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDID), From 7be2d8f84a67fb2bfae1b33fc09583d0eb3d27da Mon Sep 17 00:00:00 2001 From: Jan Teichert-Kluge Date: Tue, 17 Jun 2025 16:32:18 +0200 Subject: [PATCH 84/84] Adjust RDD 
unit tests --- doubleml/rdd/tests/test_rdd_exceptions.py | 8 ++++---- doubleml/rdd/tests/test_rdd_return_types.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/rdd/tests/test_rdd_exceptions.py b/doubleml/rdd/tests/test_rdd_exceptions.py index 6abf901e..71670793 100644 --- a/doubleml/rdd/tests/test_rdd_exceptions.py +++ b/doubleml/rdd/tests/test_rdd_exceptions.py @@ -6,7 +6,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.linear_model import Lasso, LogisticRegression -from doubleml import DoubleMLData +from doubleml import DoubleMLRDDData from doubleml.rdd import RDFlex from doubleml.rdd.datasets import make_simple_rdd_data @@ -17,7 +17,7 @@ columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") ml_g = Lasso() ml_m = LogisticRegression() @@ -58,8 +58,8 @@ def predict_proba(self, X): @pytest.mark.ci_rdd def test_rdd_exception_data(): - # DoubleMLData - msg = r"The data must be of DoubleMLData type. \[\] of type was passed." + # DoubleMLRDDData + msg = r"The data must be of DoubleMLRDDData type. \[\] of type was passed." with pytest.raises(TypeError, match=msg): _ = RDFlex([], ml_g) diff --git a/doubleml/rdd/tests/test_rdd_return_types.py b/doubleml/rdd/tests/test_rdd_return_types.py index 13248afd..56f2bfe4 100644 --- a/doubleml/rdd/tests/test_rdd_return_types.py +++ b/doubleml/rdd/tests/test_rdd_return_types.py @@ -15,7 +15,7 @@ np.column_stack((data["Y"], data["D"], data["score"], data["X"])), columns=["y", "d", "score"] + ["x" + str(i) for i in range(data["X"].shape[1])], ) -dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", s_col="score") +dml_data = dml.DoubleMLRDDData(df, y_col="y", d_cols="d", s_col="score") def _assert_return_types(dml_obj):
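# --- Usage sketch (hedged): how the refactored data backends touched by these patches fit
# together. It only reuses constructors and keyword arguments that appear in the "+" lines
# of the diffs above (DoubleMLDIDData.from_arrays, DoubleMLRDDData with score_col,
# DoubleMLData with is_cluster_data); the toy RDD DataFrame and all numeric values are
# illustrative assumptions rather than code taken from the patch series.
import numpy as np
import pandas as pd

import doubleml as dml
from doubleml.did.datasets import make_did_SZ2020

# DID backend: DoubleMLDIDData replaces DoubleMLData for the DID models.
(x, y, d, t) = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type="array")
did_data = dml.DoubleMLDIDData.from_arrays(x, y, d, t=t)

# RDD backend: DoubleMLRDDData with score_col replaces DoubleMLData with s_col.
rng = np.random.default_rng(42)
n = 200
score = rng.normal(size=n)
df_rdd = pd.DataFrame(
    {
        "y": 1.0 + 0.5 * (score >= 0) + score + rng.normal(size=n),  # toy outcome with a jump at the cutoff
        "d": (score >= 0).astype(int),  # sharp treatment assignment at cutoff 0
        "score": score,
        "x0": rng.normal(size=n),
    }
)
rdd_data = dml.DoubleMLRDDData(df_rdd, y_col="y", d_cols="d", score_col="score")

# Cluster data: the former DoubleMLClusterData is expressed through the is_cluster_data
# flag, e.g. (with x, y, d, cluster_vars generated as in the cluster tests above):
# cluster_data = dml.DoubleMLData.from_arrays(x, y, d, cluster_vars, is_cluster_data=True)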