facebookresearch · neuralsorcerer · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,11 +17,12 @@
     LogisticRegression parameters via JSON
     ([#138](https://github.com/facebookresearch/balance/pull/138)).
 - **Propensity modeling flexibility**
-  - `ipw()` now accepts any sklearn classifier via the new `sklearn_model`
-    argument, enabling the use of models like random forests while preserving
-    all existing trimming and diagnostic workflows. Dense-only estimators and
-    models without linear coefficients are fully supported, and propensity
-    probabilities are stabilized to avoid numerical issues.
+  - `ipw()` now accepts any sklearn classifier via the `model` argument,
+    enabling the use of models like random forests while preserving all
+    existing trimming and diagnostic workflows. The old `sklearn_model`
+    argument is kept as a deprecated alias. Dense-only estimators and models
+    without linear coefficients are fully supported, and propensity
+    probabilities are stabilized to avoid numerical issues.
 
 ## Documentation
 
@@ -36,6 +37,8 @@
 - Added project badges to README for build status, Python version support, and
   release tracking
   ([#145](https://github.com/facebookresearch/balance/pull/145)).
+- Added IPW quickstart tutorial showcasing default logistic regression and
+  custom sklearn classifier usage (`balance_quickstart_ipw.ipynb`).
 
 ## Code Quality & Refactoring
 

diff --git a/balance/weighting_methods/ipw.py b/balance/weighting_methods/ipw.py
@@ -399,19 +399,7 @@ def ipw(
     target_df: pd.DataFrame,
     target_weights: pd.Series,
     variables: list[str] | None = None,
-    # TODO: change 'model' to be Union[Optional[ClassifierMixin], str]
-    #       in which the default will be
-    # LogisticRegression(
-    #     "penalty": "l2",
-    #     "solver": "lbfgs",
-    #     "tol": 1e-4,
-    #     "max_iter": 5000,
-    #     "warm_start": True,
-    # )
-    # This will allow us to remove logistic_regression_kwargs and sklearn_model
-    # a user could then just update the LogisticRegression by providing a different LogisticRegression implementation
-    # Or any other sklearn classifier (e.g. RandomForestClassifier)
-    model: str = "sklearn",
+    model: str | ClassifierMixin | None = "sklearn",
     weight_trimming_mean_ratio: int | float | None = 20,
     weight_trimming_percentile: float | None = None,
     balance_classes: bool = True,
@@ -424,8 +412,6 @@ def ipw(
     formula: str | list[str] | None = None,
     penalty_factor: list[float] | None = None,
     one_hot_encoding: bool = False,
-    # TODO: This is set to be false in order to keep reproducibility of works that uses balance.
-    # The best practice is for this to be true.
     logistic_regression_kwargs: Dict[str, Any] | None = None,
     random_seed: int = 2020,
     sklearn_model: ClassifierMixin | None = None,
@@ -441,8 +427,16 @@ def ipw(
         target_weights (pd.Series): design weights for target
         variables (Optional[List[str]], optional): list of variables to include in the model.
             If None all joint variables of sample_df and target_df are used. Defaults to None.
-        model (str, optional): the model used for modeling the propensity scores.
-            "sklearn" is logistic model. Defaults to "sklearn" (no current alternatives).
+        model (Union[str, ClassifierMixin, None], optional): Model used for modeling the
+            propensity scores. Pass "sklearn" (default) or None to use the built-in
+            logistic regression, or pass an sklearn classifier implementing ``fit`` and
+            ``predict_proba`` (returning class probabilities). Common choices include
+            :class:`sklearn.linear_model.LogisticRegression`,
+            :class:`sklearn.ensemble.RandomForestClassifier`,
+            :class:`sklearn.ensemble.GradientBoostingClassifier`,
+            :class:`sklearn.ensemble.HistGradientBoostingClassifier`, and
+            :class:`sklearn.linear_model.SGDClassifier` configured with
+            ``loss="log_loss"``. Defaults to "sklearn".
         weight_trimming_mean_ratio (Optional[Union[int, float]], optional): indicating the ratio from above according to which
             the weights are trimmed by mean(weights) * ratio.
             Defaults to 20.
@@ -478,19 +472,41 @@ def ipw(
             model defaults to ``penalty="l2"``, ``solver="lbfgs"``, ``tol=1e-4``,
             ``max_iter=5000``, and ``warm_start=True``. Defaults to None.
         random_seed (int, optional): Random seed to use. Defaults to 2020.
-        sklearn_model (Optional[ClassifierMixin], optional): Custom sklearn classifier
-            to use for propensity modeling instead of the default logistic
-            regression. The estimator must implement ``fit`` and
-            ``predict_proba``. When provided, ``logistic_regression_kwargs`` and
-            ``penalty_factor`` are ignored. Defaults to None.
-            TODO: add list of (at least some of) the supported sklearn models
-            TODO: add exampels in the docstring
-            TODO: create a new tutorial quickstart_ipw (like this https://import-balance.org/docs/tutorials/quickstart/),
-                  that will include examples of the new supported models.
+        sklearn_model (Optional[ClassifierMixin], optional): Deprecated alias for
+            providing a custom sklearn classifier. Use ``model`` instead. The
+            estimator must implement ``fit`` and ``predict_proba``. When provided,
+            ``penalty_factor`` is ignored (with a warning) and passing
+            ``logistic_regression_kwargs`` raises ValueError. Defaults to None.
+
+    Examples:
+        >>> from sklearn.ensemble import RandomForestClassifier
+        >>> rf = RandomForestClassifier(n_estimators=200, random_state=0)
+        >>> ipw(
+        ...     sample_df,
+        ...     sample_weights,
+        ...     target_df,
+        ...     target_weights,
+        ...     variables=["gender", "age_group", "income"],
+        ...     model=rf,
+        ... )
+
+        >>> ipw(
+        ...     sample_df,
+        ...     sample_weights,
+        ...     target_df,
+        ...     target_weights,
+        ...     variables=["gender", "age_group", "income"],
+        ...     model="sklearn",
+        ...     logistic_regression_kwargs={"max_iter": 2000},
+        ... )
 
     Raises:
         Exception: f"Sample indicator only has value {_n_unique}. This can happen when your sample or target are empty from unknown reason"
-        NotImplementedError: if model is not "sklearn"
+        NotImplementedError: If ``model`` is a string other than "sklearn"; this
+            includes "glmnet", which is no longer supported.
+        TypeError: If ``model`` is not a string, an sklearn ``ClassifierMixin``
+            instance, or None.
+        ValueError: If ``model`` is a classifier and ``sklearn_model`` is also given.
 
     Returns:
         Dict[str, Any]: A dictionary includes:
@@ -510,11 +526,31 @@ def ipw(
                 },
             }
     """
-    if model == "glmnet":
+    custom_model: ClassifierMixin | None = None
+    model_name: str | None
+
+    if isinstance(model, ClassifierMixin):
+        custom_model = model
+        model_name = "sklearn"
+    elif model is None:
+        model_name = "sklearn"
+    elif isinstance(model, str):
+        model_name = model
+    else:
+        raise TypeError(
+            "model must be 'sklearn', an sklearn classifier implementing predict_proba, or None"
-            "model must be 'sklearn', an sklearn classifier implementing predict_proba, or None"
+            "model must be 'sklearn' (string), an sklearn classifier implementing predict_proba, or None (defaults to logistic regression)"
-            "model must be 'sklearn', an sklearn classifier implementing predict_proba, or None"
+            "model must be 'sklearn' (string), an sklearn classifier implementing predict_proba, or None (defaults to logistic regression)"
+        )
+
+    if sklearn_model is not None:
+        if custom_model is not None:
+            raise ValueError("Provide either 'model' or 'sklearn_model', not both.")
+        custom_model = sklearn_model
+
+    if model_name == "glmnet":
         raise NotImplementedError("glmnet is no longer supported")
-    elif model != "sklearn":
+    elif model_name != "sklearn":
         raise NotImplementedError(
-            f"Model '{model}' is not supported. Only 'sklearn' is currently implemented."
+            f"Model '{model_name}' is not supported. Only 'sklearn' is currently implemented."
         )
 
     logger.info("Starting ipw function")
@@ -618,7 +654,7 @@ def ipw(
         model_weights,
     )
 
-    using_default_logistic = sklearn_model is None
+    using_default_logistic = custom_model is None
 
     if using_default_logistic:
         # Standardize columns of the X matrix and penalize the columns of the X matrix according to the penalty_factor.
@@ -705,17 +741,15 @@ def ipw(
     else:
         if logistic_regression_kwargs is not None:
             raise ValueError(
-                "logistic_regression_kwargs cannot be used when providing a custom sklearn_model"
+                "logistic_regression_kwargs cannot be used when providing a custom model"
             )
         if penalty_factor is not None:
-            logger.warning(
-                "penalty_factor is ignored when using a custom sklearn_model."
-            )
+            logger.warning("penalty_factor is ignored when using a custom model.")
-            logger.warning("penalty_factor is ignored when using a custom model.")
+            logger.warning("penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.")
-            logger.warning("penalty_factor is ignored when using a custom model.")
+            logger.warning("penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.")
 
-        custom_model = clone(cast(ClassifierMixin, sklearn_model))
+        custom_model = clone(cast(ClassifierMixin, custom_model))
         if not hasattr(custom_model, "predict_proba"):
             raise ValueError(
-                "The provided sklearn_model must implement predict_proba for propensity estimation."
+                "The provided custom model must implement predict_proba for propensity estimation."
-                "The provided custom model must implement predict_proba for propensity estimation."
+                "The provided custom model must implement the predict_proba method for propensity estimation. "
+                "Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba."
-                "The provided custom model must implement predict_proba for propensity estimation."
+                "The provided custom model must implement the predict_proba method for propensity estimation. "
+                "Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba."
             )
 
         X_matrix = _convert_to_dense_array(X_matrix)
@@ -732,13 +766,13 @@ def ipw(
         probas = model.predict_proba(X_matrix)
         if probas.ndim != 2 or probas.shape[1] < 2:
             raise ValueError(
-                "The provided sklearn_model.predict_proba must return probability estimates for both classes."
+                "The provided custom model predict_proba must return probability estimates for both classes."
             )
         try:
             class_index = list(model.classes_).index(1)
         except ValueError as error:
             raise ValueError(
-                "The provided sklearn_model must be trained on the binary labels {0, 1}."
+                "The provided custom model must be trained on the binary labels {0, 1}."
             ) from error
         pred = probas[:, class_index]
         dev[0] = _compute_deviance(y, pred, model_weights)

diff --git a/tests/test_ipw.py b/tests/test_ipw.py
@@ -257,6 +257,31 @@ def test_ipw_supports_custom_sklearn_model(self) -> None:
         self.assertLessEqual(prop_dev, 1.0)
         self.assertIsNotNone(result["model"]["regularisation_perf"])
 
+    def test_ipw_supports_custom_model_parameter(self) -> None:
+        """The ``model`` parameter accepts sklearn classifiers directly."""
+
+        rng = np.random.RandomState(11)
+        sample = pd.DataFrame({"a": rng.normal(size=25), "b": rng.binomial(1, 0.3, 25)})
+        target = pd.DataFrame({"a": rng.normal(size=40), "b": rng.binomial(1, 0.6, 40)})
+
+        classifier = RandomForestClassifier(
+            n_estimators=10, max_depth=2, random_state=4
+        )
+        result = balance_ipw.ipw(
+            sample_df=sample,
+            sample_weights=pd.Series(np.ones(len(sample))),
+            target_df=target,
+            target_weights=pd.Series(np.ones(len(target))),
+            model=classifier,
+            transformations=None,
+            num_lambdas=1,
+            max_de=1.5,
+        )
+
+        self.assertIsInstance(result["model"]["fit"], RandomForestClassifier)
+        self.assertTrue(np.isnan(result["model"]["lambda"]))
+        self.assertEqual(len(result["weight"]), len(sample))
+
     def test_ipw_supports_dense_only_estimators(self) -> None:
         """Estimators that require dense matrices (e.g., GaussianNB) are supported."""
 
@@ -344,6 +369,29 @@ def predict_proba(self, X):  # type: ignore[override]
                 num_lambdas=1,
             )
 
+    def test_ipw_rejects_custom_models_without_binary_classes(self) -> None:
+        """Custom models must be trained on labels containing both 0 and 1."""
+
+        class ShiftedProbabilityModel(LogisticRegression):
+            def fit(self, X, y, sample_weight=None):  # type: ignore[override]
+                super().fit(X, y + 2, sample_weight=sample_weight)
+                return self
+
+        rng = np.random.RandomState(12)
+        sample = pd.DataFrame({"a": rng.normal(size=30)})
+        target = pd.DataFrame({"a": rng.normal(size=40)})
+
+        with self.assertRaisesRegex(ValueError, "must be trained on the binary labels"):
+            balance_ipw.ipw(
+                sample_df=sample,
+                sample_weights=pd.Series(np.ones(len(sample))),
+                target_df=target,
+                target_weights=pd.Series(np.ones(len(target))),
+                model=ShiftedProbabilityModel(max_iter=50),
+                transformations=None,
+                num_lambdas=1,
+            )
+
     def test_ipw_rejects_logistic_kwargs_with_custom_model(self) -> None:
         """Providing logistic_regression_kwargs with custom model raises an error."""
 
@@ -386,6 +434,41 @@ def test_ipw_warns_when_penalty_factor_with_custom_model(self) -> None:
             any("penalty_factor is ignored" in message for message in logs.output)
         )
 
+    def test_ipw_rejects_conflicting_model_arguments(self) -> None:
+        """Supplying both model and sklearn_model triggers a clear error."""
+
+        sample = pd.DataFrame({"a": (0, 1, 1, 0)})
+        target = pd.DataFrame({"a": (1, 0, 0, 1)})
+
+        with self.assertRaisesRegex(ValueError, "either 'model' or 'sklearn_model'"):
+            balance_ipw.ipw(
+                sample_df=sample,
+                sample_weights=pd.Series((1,) * len(sample)),
+                target_df=target,
+                target_weights=pd.Series((1,) * len(target)),
+                model=LogisticRegression(),
+                sklearn_model=LogisticRegression(),
+                transformations=None,
+                num_lambdas=1,
+            )
+
+    def test_ipw_rejects_unknown_model_identifier(self) -> None:
+        """Non-supported model identifiers raise NotImplementedError."""
+
+        sample = pd.DataFrame({"a": (0, 1)})
+        target = pd.DataFrame({"a": (1, 0)})
+
+        with self.assertRaises(NotImplementedError):
+            balance_ipw.ipw(
+                sample_df=sample,
+                sample_weights=pd.Series((1,) * len(sample)),
+                target_df=target,
+                target_weights=pd.Series((1,) * len(target)),
+                model="unsupported-model",
+                transformations=None,
+                num_lambdas=1,
+            )
+
     def test_model_coefs_handles_linear_and_non_linear_estimators(self) -> None:
         """model_coefs returns coefficients for linear models and empty series otherwise."""