-
Notifications
You must be signed in to change notification settings - Fork 47
Allow custom classifiers via ipw model parameter #177
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
8444951
946a710
919c2ba
bc391c8
2a8d6d6
8791583
2dd774f
c5e59cb
4125094
c1b7f52
48723f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -399,19 +399,7 @@ def ipw( | |||||||
| target_df: pd.DataFrame, | ||||||||
| target_weights: pd.Series, | ||||||||
| variables: list[str] | None = None, | ||||||||
| # TODO: change 'model' to be Union[Optional[ClassifierMixin], str] | ||||||||
| # in which the default will be | ||||||||
| # LogisticRegression( | ||||||||
| # "penalty": "l2", | ||||||||
| # "solver": "lbfgs", | ||||||||
| # "tol": 1e-4, | ||||||||
| # "max_iter": 5000, | ||||||||
| # "warm_start": True, | ||||||||
| # ) | ||||||||
| # This will allow us to remove logistic_regression_kwargs and sklearn_model | ||||||||
| # a user could then just update the LogisticRegression by providing a different LogisticRegression implementation | ||||||||
| # Or any other sklearn classifier (e.g. RandomForestClassifier) | ||||||||
| model: str = "sklearn", | ||||||||
| model: str | ClassifierMixin | None = "sklearn", | ||||||||
| weight_trimming_mean_ratio: int | float | None = 20, | ||||||||
| weight_trimming_percentile: float | None = None, | ||||||||
| balance_classes: bool = True, | ||||||||
|
|
@@ -424,8 +412,6 @@ def ipw( | |||||||
| formula: str | list[str] | None = None, | ||||||||
| penalty_factor: list[float] | None = None, | ||||||||
| one_hot_encoding: bool = False, | ||||||||
| # TODO: This is set to be false in order to keep reproducibility of works that uses balance. | ||||||||
| # The best practice is for this to be true. | ||||||||
| logistic_regression_kwargs: Dict[str, Any] | None = None, | ||||||||
neuralsorcerer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||
| random_seed: int = 2020, | ||||||||
| sklearn_model: ClassifierMixin | None = None, | ||||||||
|
|
@@ -441,8 +427,16 @@ def ipw( | |||||||
| target_weights (pd.Series): design weights for target | ||||||||
| variables (Optional[List[str]], optional): list of variables to include in the model. | ||||||||
| If None all joint variables of sample_df and target_df are used. Defaults to None. | ||||||||
| model (str, optional): the model used for modeling the propensity scores. | ||||||||
| "sklearn" is logistic model. Defaults to "sklearn" (no current alternatives). | ||||||||
| model (Union[str, ClassifierMixin, None], optional): Model used for modeling the | ||||||||
| propensity scores. Provide "sklearn" (default) to use logistic regression, | ||||||||
| or pass an sklearn classifier implementing ``fit`` and ``predict_proba``. | ||||||||
| Common choices include :class:`sklearn.linear_model.LogisticRegression`, | ||||||||
| :class:`sklearn.ensemble.RandomForestClassifier`, | ||||||||
| :class:`sklearn.ensemble.GradientBoostingClassifier`, | ||||||||
| :class:`sklearn.ensemble.HistGradientBoostingClassifier`, and | ||||||||
| :class:`sklearn.linear_model.SGDClassifier` configured with | ||||||||
| ``loss="log_loss"``. Custom classifiers should expose a ``predict_proba`` | ||||||||
| method returning class probabilities. | ||||||||
neuralsorcerer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||
| weight_trimming_mean_ratio (Optional[Union[int, float]], optional): indicating the ratio from above according to which | ||||||||
| the weights are trimmed by mean(weights) * ratio. | ||||||||
| Defaults to 20. | ||||||||
|
|
@@ -478,19 +472,41 @@ def ipw( | |||||||
| model defaults to ``penalty="l2"``, ``solver="lbfgs"``, ``tol=1e-4``, | ||||||||
| ``max_iter=5000``, and ``warm_start=True``. Defaults to None. | ||||||||
| random_seed (int, optional): Random seed to use. Defaults to 2020. | ||||||||
| sklearn_model (Optional[ClassifierMixin], optional): Custom sklearn classifier | ||||||||
| to use for propensity modeling instead of the default logistic | ||||||||
| regression. The estimator must implement ``fit`` and | ||||||||
| ``predict_proba``. When provided, ``logistic_regression_kwargs`` and | ||||||||
| ``penalty_factor`` are ignored. Defaults to None. | ||||||||
| TODO: add list of (at least some of) the supported sklearn models | ||||||||
| TODO: add examples in the docstring | ||||||||
| TODO: create a new tutorial quickstart_ipw (like this https://import-balance.org/docs/tutorials/quickstart/), | ||||||||
| that will include examples of the new supported models. | ||||||||
| sklearn_model (Optional[ClassifierMixin], optional): Deprecated alias for | ||||||||
neuralsorcerer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||
| providing a custom sklearn classifier. Use ``model`` instead. The | ||||||||
| estimator must implement ``fit`` and ``predict_proba``. When provided, | ||||||||
| ``logistic_regression_kwargs`` and ``penalty_factor`` are ignored. | ||||||||
| Defaults to None. | ||||||||
|
|
||||||||
| Examples: | ||||||||
| >>> from sklearn.ensemble import RandomForestClassifier | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Best to add an example that uses the simulated data.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated in 2a8d6d6
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks. |
||||||||
| >>> rf = RandomForestClassifier(n_estimators=200, random_state=0) | ||||||||
| >>> ipw( | ||||||||
| ... sample_df, | ||||||||
| ... sample_weights, | ||||||||
| ... target_df, | ||||||||
| ... target_weights, | ||||||||
| ... variables=["gender", "age_group", "income"], | ||||||||
| ... model=rf, | ||||||||
| ... ) | ||||||||
|
|
||||||||
| >>> ipw( | ||||||||
| ... sample_df, | ||||||||
| ... sample_weights, | ||||||||
| ... target_df, | ||||||||
| ... target_weights, | ||||||||
| ... variables=["gender", "age_group", "income"], | ||||||||
| ... model="sklearn", | ||||||||
| ... logistic_regression_kwargs={"max_iter": 2000}, | ||||||||
| ... ) | ||||||||
|
|
||||||||
| Raises: | ||||||||
| Exception: f"Sample indicator only has value {_n_unique}. This can happen when your sample or target are empty from unknown reason" | ||||||||
| NotImplementedError: if model is not "sklearn" | ||||||||
| NotImplementedError: If ``model`` is a string other than "sklearn" (the | ||||||||
| built-in logistic regression option) or the deprecated "glmnet". | ||||||||
| TypeError: If ``model`` is neither a supported string nor an sklearn | ||||||||
| classifier exposing ``predict_proba``. | ||||||||
| ValueError: If both ``model`` and ``sklearn_model`` are provided. | ||||||||
|
|
||||||||
| Returns: | ||||||||
| Dict[str, Any]: A dictionary includes: | ||||||||
|
|
@@ -510,11 +526,31 @@ def ipw( | |||||||
| }, | ||||||||
| } | ||||||||
| """ | ||||||||
| if model == "glmnet": | ||||||||
| custom_model: ClassifierMixin | None = None | ||||||||
| model_name: str | None | ||||||||
neuralsorcerer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||
|
|
||||||||
| if isinstance(model, ClassifierMixin): | ||||||||
| custom_model = model | ||||||||
| model_name = "sklearn" | ||||||||
| elif model is None: | ||||||||
| model_name = "sklearn" | ||||||||
neuralsorcerer marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
| elif isinstance(model, str): | ||||||||
| model_name = model | ||||||||
| else: | ||||||||
| raise TypeError( | ||||||||
| "model must be 'sklearn', an sklearn classifier implementing predict_proba, or None" | ||||||||
|
||||||||
| "model must be 'sklearn', an sklearn classifier implementing predict_proba, or None" | |
| "model must be 'sklearn' (string), an sklearn classifier implementing predict_proba, or None (defaults to logistic regression)" |
neuralsorcerer marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to keep this as a variable. Just use
`custom_model is None`
and add a comment: `# using_default_logistic`.
Copilot
AI
Nov 26, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The warning message should clarify that penalty_factor is only supported for the default logistic regression. Consider: 'penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.'
| logger.warning("penalty_factor is ignored when using a custom model.") | |
| logger.warning("penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.") |
Copilot
AI
Nov 26, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This error message could be more actionable. Consider: 'The provided custom model must implement the predict_proba method for propensity estimation. Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba.'
| "The provided custom model must implement predict_proba for propensity estimation." | |
| "The provided custom model must implement the predict_proba method for propensity estimation. " | |
| "Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba." |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -257,6 +257,31 @@ def test_ipw_supports_custom_sklearn_model(self) -> None: | |
| self.assertLessEqual(prop_dev, 1.0) | ||
| self.assertIsNotNone(result["model"]["regularisation_perf"]) | ||
|
|
||
| def test_ipw_supports_custom_model_parameter(self) -> None: | ||
| """The ``model`` parameter accepts sklearn classifiers directly.""" | ||
|
Comment on lines
+262
to
+263
|
||
|
|
||
| rng = np.random.RandomState(11) | ||
| sample = pd.DataFrame({"a": rng.normal(size=25), "b": rng.binomial(1, 0.3, 25)}) | ||
| target = pd.DataFrame({"a": rng.normal(size=40), "b": rng.binomial(1, 0.6, 40)}) | ||
|
|
||
| classifier = RandomForestClassifier( | ||
| n_estimators=10, max_depth=2, random_state=4 | ||
| ) | ||
| result = balance_ipw.ipw( | ||
| sample_df=sample, | ||
| sample_weights=pd.Series(np.ones(len(sample))), | ||
| target_df=target, | ||
| target_weights=pd.Series(np.ones(len(target))), | ||
| model=classifier, | ||
| transformations=None, | ||
| num_lambdas=1, | ||
| max_de=1.5, | ||
| ) | ||
|
|
||
| self.assertIsInstance(result["model"]["fit"], RandomForestClassifier) | ||
| self.assertTrue(np.isnan(result["model"]["lambda"])) | ||
| self.assertEqual(len(result["weight"]), len(sample)) | ||
|
|
||
| def test_ipw_supports_dense_only_estimators(self) -> None: | ||
| """Estimators that require dense matrices (e.g., GaussianNB) are supported.""" | ||
|
|
||
|
|
@@ -344,6 +369,29 @@ def predict_proba(self, X): # type: ignore[override] | |
| num_lambdas=1, | ||
| ) | ||
|
|
||
| def test_ipw_rejects_custom_models_without_binary_classes(self) -> None: | ||
| """Custom models must be trained on labels containing both 0 and 1.""" | ||
|
|
||
| class ShiftedProbabilityModel(LogisticRegression): | ||
| def fit(self, X, y, sample_weight=None): # type: ignore[override] | ||
| super().fit(X, y + 2, sample_weight=sample_weight) | ||
| return self | ||
|
|
||
| rng = np.random.RandomState(12) | ||
| sample = pd.DataFrame({"a": rng.normal(size=30)}) | ||
| target = pd.DataFrame({"a": rng.normal(size=40)}) | ||
|
|
||
| with self.assertRaisesRegex(ValueError, "must be trained on the binary labels"): | ||
| balance_ipw.ipw( | ||
| sample_df=sample, | ||
| sample_weights=pd.Series(np.ones(len(sample))), | ||
| target_df=target, | ||
| target_weights=pd.Series(np.ones(len(target))), | ||
| model=ShiftedProbabilityModel(max_iter=50), | ||
| transformations=None, | ||
| num_lambdas=1, | ||
| ) | ||
|
|
||
| def test_ipw_rejects_logistic_kwargs_with_custom_model(self) -> None: | ||
| """Providing logistic_regression_kwargs with custom model raises an error.""" | ||
|
|
||
|
|
@@ -386,6 +434,41 @@ def test_ipw_warns_when_penalty_factor_with_custom_model(self) -> None: | |
| any("penalty_factor is ignored" in message for message in logs.output) | ||
| ) | ||
|
|
||
| def test_ipw_rejects_conflicting_model_arguments(self) -> None: | ||
| """Supplying both model and sklearn_model triggers a clear error.""" | ||
|
|
||
| sample = pd.DataFrame({"a": (0, 1, 1, 0)}) | ||
| target = pd.DataFrame({"a": (1, 0, 0, 1)}) | ||
|
|
||
| with self.assertRaisesRegex(ValueError, "either 'model' or 'sklearn_model'"): | ||
| balance_ipw.ipw( | ||
| sample_df=sample, | ||
| sample_weights=pd.Series((1,) * len(sample)), | ||
| target_df=target, | ||
| target_weights=pd.Series((1,) * len(target)), | ||
| model=LogisticRegression(), | ||
| sklearn_model=LogisticRegression(), | ||
| transformations=None, | ||
| num_lambdas=1, | ||
| ) | ||
|
|
||
| def test_ipw_rejects_unknown_model_identifier(self) -> None: | ||
| """Non-supported model identifiers raise NotImplementedError.""" | ||
|
|
||
| sample = pd.DataFrame({"a": (0, 1)}) | ||
| target = pd.DataFrame({"a": (1, 0)}) | ||
|
|
||
| with self.assertRaises(NotImplementedError): | ||
| balance_ipw.ipw( | ||
| sample_df=sample, | ||
| sample_weights=pd.Series((1,) * len(sample)), | ||
| target_df=target, | ||
| target_weights=pd.Series((1,) * len(target)), | ||
| model="unsupported-model", | ||
| transformations=None, | ||
| num_lambdas=1, | ||
| ) | ||
|
|
||
| def test_model_coefs_handles_linear_and_non_linear_estimators(self) -> None: | ||
| """model_coefs returns coefficients for linear models and empty series otherwise.""" | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.