Skip to content
13 changes: 8 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
LogisticRegression parameters via JSON
([#138](https://github.com/facebookresearch/balance/pull/138)).
- **Propensity modeling flexibility**
- `ipw()` now accepts any sklearn classifier via the new `sklearn_model`
argument, enabling the use of models like random forests while preserving
all existing trimming and diagnostic workflows. Dense-only estimators and
models without linear coefficients are fully supported, and propensity
probabilities are stabilized to avoid numerical issues.
- `ipw()` now accepts any sklearn classifier via the `model` argument and
deprecates the old `sklearn_model` alias, enabling the use of models like
random forests while preserving all existing trimming and diagnostic
workflows. Dense-only estimators and models without linear coefficients are
fully supported, and propensity probabilities are stabilized to avoid
numerical issues.

## Documentation

Expand All @@ -36,6 +37,8 @@
- Added project badges to README for build status, Python version support, and
release tracking
([#145](https://github.com/facebookresearch/balance/pull/145)).
- Added IPW quickstart tutorial showcasing default logistic regression and
custom sklearn classifier usage (`balance_quickstart_ipw.ipynb`).

## Code Quality & Refactoring

Expand Down
112 changes: 73 additions & 39 deletions balance/weighting_methods/ipw.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,19 +399,7 @@ def ipw(
target_df: pd.DataFrame,
target_weights: pd.Series,
variables: list[str] | None = None,
# TODO: change 'model' to be Union[Optional[ClassifierMixin], str]
# in which the default will be
# LogisticRegression(
# "penalty": "l2",
# "solver": "lbfgs",
# "tol": 1e-4,
# "max_iter": 5000,
# "warm_start": True,
# )
# This will allow us to remove logistic_regression_kwargs and sklearn_model
# a user could then just update the LogisticRegression by providing a different LogisticRegression implementation
# Or any other sklearn classifier (e.g. RandomForestClassifier)
model: str = "sklearn",
model: str | ClassifierMixin | None = "sklearn",
weight_trimming_mean_ratio: int | float | None = 20,
weight_trimming_percentile: float | None = None,
balance_classes: bool = True,
Expand All @@ -424,8 +412,6 @@ def ipw(
formula: str | list[str] | None = None,
penalty_factor: list[float] | None = None,
one_hot_encoding: bool = False,
# TODO: This is set to be false in order to keep reproducibility of works that uses balance.
# The best practice is for this to be true.
logistic_regression_kwargs: Dict[str, Any] | None = None,
random_seed: int = 2020,
sklearn_model: ClassifierMixin | None = None,
Expand All @@ -441,8 +427,16 @@ def ipw(
target_weights (pd.Series): design weights for target
variables (Optional[List[str]], optional): list of variables to include in the model.
If None all joint variables of sample_df and target_df are used. Defaults to None.
model (str, optional): the model used for modeling the propensity scores.
"sklearn" is logistic model. Defaults to "sklearn" (no current alternatives).
model (Union[str, ClassifierMixin, None], optional): Model used for modeling the
propensity scores. Provide "sklearn" (default) to use logistic regression,
or pass an sklearn classifier implementing ``fit`` and ``predict_proba``.
Common choices include :class:`sklearn.linear_model.LogisticRegression`,
:class:`sklearn.ensemble.RandomForestClassifier`,
:class:`sklearn.ensemble.GradientBoostingClassifier`,
:class:`sklearn.ensemble.HistGradientBoostingClassifier`, and
:class:`sklearn.linear_model.SGDClassifier` configured with
``loss="log_loss"``. Custom classifiers should expose a ``predict_proba``
method returning class probabilities.
weight_trimming_mean_ratio (Optional[Union[int, float]], optional): indicating the ratio from above according to which
the weights are trimmed by mean(weights) * ratio.
Defaults to 20.
Expand Down Expand Up @@ -478,19 +472,41 @@ def ipw(
model defaults to ``penalty="l2"``, ``solver="lbfgs"``, ``tol=1e-4``,
``max_iter=5000``, and ``warm_start=True``. Defaults to None.
random_seed (int, optional): Random seed to use. Defaults to 2020.
sklearn_model (Optional[ClassifierMixin], optional): Custom sklearn classifier
to use for propensity modeling instead of the default logistic
regression. The estimator must implement ``fit`` and
``predict_proba``. When provided, ``logistic_regression_kwargs`` and
``penalty_factor`` are ignored. Defaults to None.
TODO: add list of (at least some of) the supported sklearn models
TODO: add exampels in the docstring
TODO: create a new tutorial quickstart_ipw (like this https://import-balance.org/docs/tutorials/quickstart/),
that will include examples of the new supported models.
sklearn_model (Optional[ClassifierMixin], optional): Deprecated alias for
providing a custom sklearn classifier. Use ``model`` instead. The
estimator must implement ``fit`` and ``predict_proba``. When provided,
``logistic_regression_kwargs`` and ``penalty_factor`` are ignored.
Defaults to None.

Examples:
>>> from sklearn.ensemble import RandomForestClassifier
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

best to add an example that uses the simulated data.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated in 2a8d6d6

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks.
Note that the tutorials show the rendered output of each code command, whereas the docstring examples are not executed on the website, so it's worth also including the expected output here.

>>> rf = RandomForestClassifier(n_estimators=200, random_state=0)
>>> ipw(
... sample_df,
... sample_weights,
... target_df,
... target_weights,
... variables=["gender", "age_group", "income"],
... model=rf,
... )

>>> ipw(
... sample_df,
... sample_weights,
... target_df,
... target_weights,
... variables=["gender", "age_group", "income"],
... model="sklearn",
... logistic_regression_kwargs={"max_iter": 2000},
... )

Raises:
Exception: f"Sample indicator only has value {_n_unique}. This can happen when your sample or target are empty from unknown reason"
NotImplementedError: if model is not "sklearn"
NotImplementedError: If ``model`` is a string other than "sklearn" (the
built-in logistic regression option) or the deprecated "glmnet".
TypeError: If ``model`` is neither a supported string nor an sklearn
classifier exposing ``predict_proba``.
ValueError: If both ``model`` and ``sklearn_model`` are provided.

Returns:
Dict[str, Any]: A dictionary includes:
Expand All @@ -510,11 +526,31 @@ def ipw(
},
}
"""
if model == "glmnet":
custom_model: ClassifierMixin | None = None
model_name: str | None

if isinstance(model, ClassifierMixin):
custom_model = model
model_name = "sklearn"
elif model is None:
model_name = "sklearn"
elif isinstance(model, str):
model_name = model
else:
raise TypeError(
"model must be 'sklearn', an sklearn classifier implementing predict_proba, or None"
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error message mentions None as a valid option, but None is effectively treated as 'sklearn'. Consider clarifying: 'model must be "sklearn" (string), an sklearn classifier implementing predict_proba, or None (defaults to logistic regression)'

Suggested change
"model must be 'sklearn', an sklearn classifier implementing predict_proba, or None"
"model must be 'sklearn' (string), an sklearn classifier implementing predict_proba, or None (defaults to logistic regression)"

Copilot uses AI. Check for mistakes.
)

if sklearn_model is not None:
if custom_model is not None:
raise ValueError("Provide either 'model' or 'sklearn_model', not both.")
custom_model = sklearn_model

if model_name == "glmnet":
raise NotImplementedError("glmnet is no longer supported")
elif model != "sklearn":
elif model_name != "sklearn":
raise NotImplementedError(
f"Model '{model}' is not supported. Only 'sklearn' is currently implemented."
f"Model '{model_name}' is not supported. Only 'sklearn' is currently implemented."
)

logger.info("Starting ipw function")
Expand Down Expand Up @@ -618,7 +654,7 @@ def ipw(
model_weights,
)

using_default_logistic = sklearn_model is None
using_default_logistic = custom_model is None

if using_default_logistic:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to keep this as a variable. Just use
custom_model is None
directly in the condition, and add a comment: # using_default_logistic

# Standardize columns of the X matrix and penalize the columns of the X matrix according to the penalty_factor.
Expand Down Expand Up @@ -705,17 +741,15 @@ def ipw(
else:
if logistic_regression_kwargs is not None:
raise ValueError(
"logistic_regression_kwargs cannot be used when providing a custom sklearn_model"
"logistic_regression_kwargs cannot be used when providing a custom model"
)
if penalty_factor is not None:
logger.warning(
"penalty_factor is ignored when using a custom sklearn_model."
)
logger.warning("penalty_factor is ignored when using a custom model.")
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning message should clarify that penalty_factor is only supported for the default logistic regression. Consider: 'penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.'

Suggested change
logger.warning("penalty_factor is ignored when using a custom model.")
logger.warning("penalty_factor is only supported with the default logistic regression model and will be ignored when using a custom classifier.")

Copilot uses AI. Check for mistakes.

custom_model = clone(cast(ClassifierMixin, sklearn_model))
custom_model = clone(cast(ClassifierMixin, custom_model))
if not hasattr(custom_model, "predict_proba"):
raise ValueError(
"The provided sklearn_model must implement predict_proba for propensity estimation."
"The provided custom model must implement predict_proba for propensity estimation."
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error message could be more actionable. Consider: 'The provided custom model must implement the predict_proba method for propensity estimation. Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba.'

Suggested change
"The provided custom model must implement predict_proba for propensity estimation."
"The provided custom model must implement the predict_proba method for propensity estimation. "
"Ensure your classifier inherits from sklearn.base.ClassifierMixin and defines predict_proba."

Copilot uses AI. Check for mistakes.
)

X_matrix = _convert_to_dense_array(X_matrix)
Expand All @@ -732,13 +766,13 @@ def ipw(
probas = model.predict_proba(X_matrix)
if probas.ndim != 2 or probas.shape[1] < 2:
raise ValueError(
"The provided sklearn_model.predict_proba must return probability estimates for both classes."
"The provided custom model predict_proba must return probability estimates for both classes."
)
try:
class_index = list(model.classes_).index(1)
except ValueError as error:
raise ValueError(
"The provided sklearn_model must be trained on the binary labels {0, 1}."
"The provided custom model must be trained on the binary labels {0, 1}."
) from error
pred = probas[:, class_index]
dev[0] = _compute_deviance(y, pred, model_weights)
Expand Down
83 changes: 83 additions & 0 deletions tests/test_ipw.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,31 @@ def test_ipw_supports_custom_sklearn_model(self) -> None:
self.assertLessEqual(prop_dev, 1.0)
self.assertIsNotNone(result["model"]["regularisation_perf"])

def test_ipw_supports_custom_model_parameter(self) -> None:
"""The ``model`` parameter accepts sklearn classifiers directly."""
Comment on lines +262 to +263
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test duplicates the coverage already provided by test_ipw_supports_custom_sklearn_model. Consider removing this test or expanding it to verify distinct behavior not covered by the existing test, such as testing with a different classifier or edge case.

Copilot uses AI. Check for mistakes.

rng = np.random.RandomState(11)
sample = pd.DataFrame({"a": rng.normal(size=25), "b": rng.binomial(1, 0.3, 25)})
target = pd.DataFrame({"a": rng.normal(size=40), "b": rng.binomial(1, 0.6, 40)})

classifier = RandomForestClassifier(
n_estimators=10, max_depth=2, random_state=4
)
result = balance_ipw.ipw(
sample_df=sample,
sample_weights=pd.Series(np.ones(len(sample))),
target_df=target,
target_weights=pd.Series(np.ones(len(target))),
model=classifier,
transformations=None,
num_lambdas=1,
max_de=1.5,
)

self.assertIsInstance(result["model"]["fit"], RandomForestClassifier)
self.assertTrue(np.isnan(result["model"]["lambda"]))
self.assertEqual(len(result["weight"]), len(sample))

def test_ipw_supports_dense_only_estimators(self) -> None:
"""Estimators that require dense matrices (e.g., GaussianNB) are supported."""

Expand Down Expand Up @@ -344,6 +369,29 @@ def predict_proba(self, X): # type: ignore[override]
num_lambdas=1,
)

def test_ipw_rejects_custom_models_without_binary_classes(self) -> None:
    """ipw() rejects a custom classifier whose fitted classes lack the label 1.

    The helper estimator adds 2 to every training label before fitting, so
    the class lookup for label ``1`` performed by ``ipw`` is expected to fail
    and surface as a ``ValueError`` about the binary-label requirement.
    """

    class ShiftedProbabilityModel(LogisticRegression):
        # Fit on shifted labels (y + 2) instead of the labels ipw() passes in.
        def fit(self, X, y, sample_weight=None):  # type: ignore[override]
            super().fit(X, y + 2, sample_weight=sample_weight)
            return self

    # Draw the sample first and the target second so the seeded stream is
    # consumed in a fixed order.
    generator = np.random.RandomState(12)
    sample_frame = pd.DataFrame({"a": generator.normal(size=30)})
    target_frame = pd.DataFrame({"a": generator.normal(size=40)})

    expected_message = "must be trained on the binary labels"
    with self.assertRaisesRegex(ValueError, expected_message):
        balance_ipw.ipw(
            sample_df=sample_frame,
            sample_weights=pd.Series(np.ones(len(sample_frame))),
            target_df=target_frame,
            target_weights=pd.Series(np.ones(len(target_frame))),
            model=ShiftedProbabilityModel(max_iter=50),
            transformations=None,
            num_lambdas=1,
        )

def test_ipw_rejects_logistic_kwargs_with_custom_model(self) -> None:
"""Providing logistic_regression_kwargs with custom model raises an error."""

Expand Down Expand Up @@ -386,6 +434,41 @@ def test_ipw_warns_when_penalty_factor_with_custom_model(self) -> None:
any("penalty_factor is ignored" in message for message in logs.output)
)

def test_ipw_rejects_conflicting_model_arguments(self) -> None:
    """Passing a classifier via both ``model`` and ``sklearn_model`` raises ValueError."""

    # Tiny four-row frames are enough: the call is expected to fail on the
    # conflicting-arguments error.
    sample_frame = pd.DataFrame({"a": (0, 1, 1, 0)})
    target_frame = pd.DataFrame({"a": (1, 0, 0, 1)})

    conflict_pattern = "either 'model' or 'sklearn_model'"
    with self.assertRaisesRegex(ValueError, conflict_pattern):
        balance_ipw.ipw(
            sample_df=sample_frame,
            sample_weights=pd.Series((1,) * len(sample_frame)),
            target_df=target_frame,
            target_weights=pd.Series((1,) * len(target_frame)),
            model=LogisticRegression(),
            sklearn_model=LogisticRegression(),
            transformations=None,
            num_lambdas=1,
        )

def test_ipw_rejects_unknown_model_identifier(self) -> None:
    """Unsupported string identifiers raise NotImplementedError naming the model.

    The message is matched as well as the exception type, because ``ipw``
    also raises ``NotImplementedError`` for the removed "glmnet" option; a
    type-only check would not tell the two failure modes apart.
    """

    sample = pd.DataFrame({"a": (0, 1)})
    target = pd.DataFrame({"a": (1, 0)})

    with self.assertRaisesRegex(
        NotImplementedError, "Model 'unsupported-model' is not supported"
    ):
        balance_ipw.ipw(
            sample_df=sample,
            sample_weights=pd.Series((1,) * len(sample)),
            target_df=target,
            target_weights=pd.Series((1,) * len(target)),
            model="unsupported-model",
            transformations=None,
            num_lambdas=1,
        )

def test_model_coefs_handles_linear_and_non_linear_estimators(self) -> None:
"""model_coefs returns coefficients for linear models and empty series otherwise."""

Expand Down
Loading