diff --git a/.github/workflows/test-type-lint.yaml b/.github/workflows/test-type-lint.yaml index 51e4241f..cc1ee818 100644 --- a/.github/workflows/test-type-lint.yaml +++ b/.github/workflows/test-type-lint.yaml @@ -48,6 +48,7 @@ jobs: run: | source activate ./ci_env pip install -e .[dev] + pip install scikit-learn lightning # for docs - name: Print installed packages run: | @@ -76,7 +77,7 @@ jobs: sed -i 's/\"auto\"/None/g' README.md # on Mac: sed -i '' 's/cluster: slurm/cluster: null/g' infra/*.md # check readmes - pytest --markdown-docs -m markdown-docs `**/*.md` + pytest --markdown-docs -m markdown-docs . - name: Run basic pylint run: | diff --git a/docs/infra/example_sklearn.py b/docs/infra/example_sklearn.py new file mode 100644 index 00000000..2b78d9f5 --- /dev/null +++ b/docs/infra/example_sklearn.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +""" +A minimalist example with sklearn to show how to develop and explore a model with exca. 
+"""
+import sys
+import typing as tp
+
+import exca
+import numpy as np
+import pydantic
+from sklearn.datasets import make_regression
+from sklearn.linear_model import Ridge
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+
+class Dataset(pydantic.BaseModel):
+    """Configuration of a synthetic regression dataset and its train/test split.
+
+    The fields are forwarded as-is to scikit-learn's ``make_regression`` and
+    ``train_test_split`` in :meth:`get`.
+    """
+
+    n_samples: int = 100  # passed to make_regression
+    noise: float = 0.1  # passed to make_regression
+    random_state: int = 42  # seeds both the data generation and the split
+    test_size: float = 0.2  # passed to train_test_split
+    # Reject unknown parameters instead of silently ignoring them
+    model_config = pydantic.ConfigDict(extra="forbid")
+
+    def get(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Generate the data and split it.
+
+        Returns:
+            ``(X_train, X_test, y_train, y_test)``, in the order returned by
+            ``train_test_split``.
+        """
+        # Generate synthetic data
+        X, y = make_regression(
+            n_samples=self.n_samples,
+            noise=self.noise,
+            random_state=self.random_state,
+        )
+        # Split into training and testing datasets
+        X_train, X_test, y_train, y_test = train_test_split(
+            X,
+            y,
+            test_size=self.test_size,
+            random_state=self.random_state,
+        )
+        return X_train, X_test, y_train, y_test
+
+
+class Model(pydantic.BaseModel):
+    """Ridge regression experiment evaluated on a :class:`Dataset`.
+
+    ``score`` is wrapped by ``infra.apply``, so its execution is delegated to
+    the exca ``TaskInfra`` configured below (which uses the ``.cache/`` folder).
+    """
+
+    data: Dataset = Dataset()
+    alpha: float = 1.0  # passed to Ridge
+    max_iter: int = 1000  # passed to Ridge
+    infra: exca.TaskInfra = exca.TaskInfra(folder=".cache/")
+    # Same policy as Dataset: a misspelled override (e.g. "alhpa") must raise
+    # a validation error rather than being dropped silently.
+    model_config = pydantic.ConfigDict(extra="forbid")
+
+    @infra.apply
+    def score(self) -> float:
+        """Fit a Ridge model on the training split and evaluate it.
+
+        Returns:
+            The mean squared error of the predictions on the test split.
+        """
+        # Get data
+        X_train, X_test, y_train, y_test = self.data.get()
+
+        # Train a Ridge regression model
+        print("Fit...")
+        # Named "ridge" so it does not shadow the module-level "model"
+        # created in the __main__ section.
+        ridge = Ridge(alpha=self.alpha, max_iter=self.max_iter)
+        ridge.fit(X_train, y_train)
+
+        # Evaluate
+        print("Score...")
+        y_pred = ridge.predict(X_test)
+        return mean_squared_error(y_test, y_pred)
+
+
+if __name__ == "__main__":
+    # Validate config: build the model from the command-line arguments,
+    # pydantic raises if a parameter is unknown or has the wrong type
+    config = exca.ConfDict.from_args(sys.argv[1:])
+    model = Model(**config)
+    print(model.infra.config)
+
+    # Score
+    mse = model.score()
+    print(mse)