diff --git a/.gitignore b/.gitignore index 8b3032d..aca1c60 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,9 @@ docs/_build # Virtualenvs env/ venv/ + +# VS code +.vscode + +# Notebooks +notebooks/** \ No newline at end of file diff --git a/lorax/lorax.py b/lorax/lorax.py index bebda9e..8e969b8 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -2,6 +2,7 @@ import re import logging import pandas as pd +import numpy as np from math import sqrt from scipy import stats @@ -33,24 +34,340 @@ class TheLorax(object): with other types of models and problems (regression/multinomial classification) Args: - clf (sklearn classifier) The classifier to be explained, e.g., + clf (sklearn classifier): The classifier to be explained, e.g., sklearn.ensemble.RandomForestClassifier - test_mat (pandas.DataFrame) The test matrix containing all examples to be - explained. If `id_col=None` (the default), the id for referencing entities - must be set as this dataframe's index. - id_col (str) The column name for the entity id in the test matrix. If `None` - (the default), the test matrix must be indexed by the entity id. - date_col (str) The date column in the matrix (default: `as_of_date`) - outcome_col (str) The outcome column in the matrix (default: `outcome`). To - indicate that the test matrix has no labels, set `outcome_col=None`. - name_patterns (list) An optional list of regex patterns or compiled regex + The classifier should be already trained. + + column_names (List(str)): The input feature names of the data for which the classifier was trained + + column_patterns (list): An optional list of regex patterns or compiled regex objects to group together features for reporting contributions. If using, each feature name in the test matrix must match one and only one pattern. + + test_mat (pandas.DataFrame): A test matrix to be pre-loaded to the object. 
This dataset will be used for providing feature distributions to the individual + If `id_col=None` (the default), the id for referencing entities + must be set as this dataframe's index. + + id_col (str): The column name for the entity id in the test matrix. If `None` + (the default), the test matrix must be indexed by the entity id. + + date_col (str): The date column in the matrix (default: None). + If None, the matrix shouldn't contain a column with date + + outcome_col (str): The outcome column in the matrix (default: None). + To indicate that the test matrix has no labels, set `outcome_col=None`. + """ - def __init__(self, clf, test_mat, id_col=None, + def __init__(self, clf, column_names, + column_patterns=None, + test_mat=None, + id_col=None, + date_col=None, outcome_col=None): + + self.clf = clf + + # NOTE: Minor. maybe this should be feature_names and feature_patterns + # To separate from the index and the outcome + self.column_names = column_names + self.column_patterns = column_patterns + + # Register the regex patterns and associated columns if using + if column_patterns is not None: + self.set_name_patterns(column_patterns) + + # NOTE-KA: I feel like the method should be independent of these as these seem very triage specific. + # We can always have a script that bridges the triage data with the explain API + # Leaving this decoupling to another PR + self.id_col = id_col + self.date_col = date_col + self.outcome_col = outcome_col + + self.combined_index = False + if id_col is not None: + if type(id_col) in [list, tuple]: + self.combined_index = True + + # TODO: These should be moved out from the constructor. 
+ # Current version of the code depends on their existence + self.X_test = None + self.y_test = None + self.preds = None + self.drop_cols = list() + self.feature_stats = None + + if test_mat is not None: + self.load_dataset(test_mat=test_mat, id_col=id_col, date_col=date_col, outcome_col=outcome_col) + + # When populated, this will contain the component information of the model + self.model_info = dict() + + def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outcome_col=None): + """ Loading a test dataset to the object. + This dataset can be used to supplement individual feature importances with feature distribution stats + + Args: + test_mat: A pandas dataframe containing the dataset + id_col: The name(s) of the columns to uniquely identify an instance (entity_id in triage land) + date_col: Name of the column that has the date information + outcome_col: If the dataframe contains the target output, the column name of the target + + return: + None + """ + + df = test_mat.copy() + + if id_col is not None: + df.set_index(id_col, inplace=True) + # TODO: minor, check whether this is the ideal way of doing this + if type(id_col) in [list, tuple]: + self.combined_index = True + + # exclude non-feature columns (date, outcome if present) + if (date_col is not None) and (date_col not in (id_col if isinstance(id_col, (list, tuple)) else [id_col])): + self.drop_cols.append(date_col) + + if outcome_col is not None: + self.drop_cols.append(outcome_col) + + self.y_test = df[outcome_col] + + self.X_test = df.drop(self.drop_cols, axis=1) + + # Setting the predictions of the test dataset + self.preds = pd.DataFrame( + {'pred': [p[1] for p in self.clf.predict_proba(self.X_test.values)]}, + index=self.X_test.index + ) + + # For classifiers with intercepts, we add the intercept as a "feature" + if hasattr(self.clf, 'intercept_'): + self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] + + # Making sure the column names match the test matrices columns + self.column_names = 
self.X_test.columns.values + + # pre-calculating the feature distributions + self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) + + def explain_example(self, + sample=None, + descriptive=False, + test_mat=None, + idx=None, + pred_class=None, + graph=False, + num_features=10, + how='features'): + """ + Graph or return individual feature importances for an example. + + This method is the primary interface for TheLorax to calculate individual feature + importances for a given example. The function can be used to either + return a pandas DataFrame with contributions and feature distributions (if + `graph=False`) or a graphical representation of the top `num_features` contributions + (if `graph=True`) for use in a jupyter notebook. + + Feature contributions can be calucalted either for all features separately (`how='features'`, + the default) or using regular expression patterns to group sets of features together + (`how='patterns'`). When graphing contributions for all features, graphs will contain two + components: + 1. A bar graph of the top num_features contributions to the example's score + 2. For each of these features, a graph showing the percentile for the feature's mean + across the entire test set (gray dot), the percentile of the feature value for the + example being explained (orange dot) and the z-score for that value + When using regular expression patterns, the feature distribution information is omitted + (from both graphical and dataframe outputs) as the contributions reflect aggregations over + an arbitrary number and types of features. + + Args: + sample (array): The instance that is to be explained. If this is None, a test matrix and a sample index should be provided + + descriptive (bool): Whether to accompany the explanations with feature distribution data of a test dataset. + Gives more context to the feature important scores. 
+ To be used, a test dataset should be availabe to the function through `test_mat` or `load_dataset()`. + If not set, only the individual feature importance scores will be returned to the user. + + test_mat (pd.DataFrame): A test dataset to be used for descriptive explanations. If provided, this will override any dataset preloaded using `load_dataset()` + + idx (int): The index---w.r.t to the test dataset provided through `test_mat` or `load_dataset()`---of the example we want to explain. + If both, a sample and an index are provided, sample will be ignored + + pred_class (int): The predicted class for the example (currently must be 1 or 0). The + returned feature contributions will be taken relative to the score for this class. + If None (the default), the predicted class will be assigned based on whether the + example's score is above or below a threshold of 0.5. + + graph (bool): Whether to graph the feature contributions or return a dataframe + without graphing (default: False) + + num_features (int): The number of features with the highest contributions to graph + (ignored if `graph=False` in which case the entire set will be returned) + + how (str): Whether to calculate feature contributions at the level of individual features + (`how='features'`, the default) or using regex patterns (`how='patterns'`). + If using regex patterns, `name_patterns` must have been provided when the object + as constructed or through calling `set_name_patterns()`. + + Returns: + If `graph=False`, returns a pandas dataframe with individual feature contributions + and (if using `how='features'`) feature distribution information + + """ + # TODO: Categoricals can be handled using regex patterns, but this currently precludes + # showing feature distribution information (since we don't know how to combine distributions + # for arbitary feature groupings), but if just using patterns for categoricals/imputed flags + # we should still be able to show relevant distribution info... 
+ + # User has to pass either an index and a test_mat or a samples (a row) + if sample is None and ((test_mat is None and self.X_test is None) or idx is None): + raise ValueError('Must either provide a data sample or a test matrix with a sample index') + + # A test matrix is necessary for getting descriptive stats + if descriptive and (test_mat is None and self.X_test is None): + raise ValueError('Sould provide a test dataset or should have preloaded a test dataset') + + if how == 'patterns' and self.column_patterns is None: + raise ValueError('Must specify name patterns to aggregate over.' + + 'Use TheLorax.set_name_patterns() first.') + elif how not in ['features', 'patterns']: + # NOTE-KA: Minor, in this case, should we default to features and let the code run with a warning? + raise ValueError('How must be one of features or patterns.') + + # TODO: Add error handling for sample's features and the data features. + if isinstance(sample, pd.Series): + sample = sample.values + + if self.X_test is not None and idx is not None: + sample = self.X_test.loc[idx] + + # When creating X_test in load data, + # if the classifier contains a Intercept, it is added as a feature. 
+ # Have to remove it before the inference phase + if 'Intercept' in sample.index.values: + sample = sample.drop(['Intercept']) + + sample = sample.values + + # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns + if test_mat is not None: + # Indexing and exclusing non-feature columns + # NOTE-KA: I think this should be handled outside of Lorax + if self.id_col is not None: + test_mat.set_index(self.id_col, inplace=True) + + for dr_col in self.drop_cols: + # Dropping the non-feature columns in the new test matrix, if they exist + # TODO: Handle the ID cols, Date cols elegantly + test_mat = test_mat.drop(dr_col, axis=1, errors='ignore') + + if idx is not None: + sample = test_mat.loc[idx].values + + # Calculating Feature contributions + if isinstance(self.clf, RandomForestClassifier): + # Getting values for Random Forest Classifier + return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) + + # Feature importance scores + contrib_list = return_tuple[4] + + # Model component information + self.model_info['num_trees'] = return_tuple[0] + self.model_info['global_score_dict'] = return_tuple[1] + self.model_info['feature_dict'] = return_tuple[2] + self.model_info['aggregated_dict'] = return_tuple[3] + + elif isinstance(self.clf, LogisticRegression): + # Getting values for Random Forest Classifier + # TODO: The column names need to be consolidated + + contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) + + # Setting the prediction class + # predict proba returns a 2D array + scores = self.clf.predict_proba(sample.reshape(1, -1))[0] + + if pred_class is None: + # TODO: Multiclass adpatation + # use np.argmax(), or clf.predict() + pred_class = np.argmax(scores) + + # Prediction score for the given pred_class + score = scores[pred_class] + + # TODO: handle this more elegantly for multiclass problems + # We need to flip the sign of the scores. 
+ if pred_class == 0: + # `score` is already P(pred_class) via scores[pred_class]; only the contribution signs flip + contrib_list = [(feature, contrib * -1) for feature, contrib in contrib_list] + + # TODO: Need to be modified to not taking the index + # Replacing the example id with -1 for now + logging.info('Used predicted class {} for example {}, score={}'.format(pred_class, + -1, + score)) + + # sorting in descending order by contribution then by feature name in the case of ties + contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) + + # TODO: If descriptive is set, the importance scores + # are supported with the context provided by a test dataset + # The code is available in the original constructor, move it here + if descriptive: + # If descriptive, it rolls back to the original case + if test_mat is None: + test_mat = self.X_test + fstats = self.feature_stats + else: + fstats = self.populate_feature_stats(test_mat) + + contrib_df = self._build_contrib_df(contrib_list, + test_mat=test_mat, + idx=idx, + sample=sample, + feature_stats=fstats, + how=how) + + # adding overall feature importance from model level + overall_importance = [] + for i, cname in enumerate(self.column_names): + if isinstance(self.clf, LogisticRegression): + overall_importance.append((cname, self.clf.coef_[0][i])) + + elif isinstance(self.clf, RandomForestClassifier): + overall_importance.append((cname, self.clf.feature_importances_[i])) + + else: + pass + + updated_list = add_overall_feature_importance(contrib_list, overall_importance) + updated_columns = ['feature', 'sample_rank', 'overall_imp', 'overall_rank', 'rank_change'] + contrib_df = contrib_df.join( + pd.DataFrame( + data=updated_list, + columns=updated_columns + ).set_index('feature') + ) + + if graph: + self._plot_graph(idx, + pred_class, + score, + num_features, + contrib_df, + how) + + else: + contrib_df = self._build_contrib_df_sample(contrib_list, how=how) + + return contrib_df + + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', name_patterns=None): + # 
TODO: This method should be removed after verifying that the new init compares """ Initialize Lorax. @@ -102,18 +419,28 @@ def __init__(self, clf, test_mat, id_col=None, self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] # pre-calcuate feature distribution statistics for each feature - self._populate_feature_stats() + self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) + + # TODO: make protected again. Making public for testing + def populate_feature_stats(self, test_mat): + """ + Pre-calculates the feature distribution information from a test matrix, including + type (continuous or binary), mean, median, 5th & 95th percentiles, standard deviation. - def _populate_feature_stats(self): - """Setter function for feature distribution statistics. + Args: + test_mat (pandas.DataFrame): THe test matrix in a dataframe form - Pre-calculates the feature distribution information from the test matrix, including - type (continuous or binary), mean, median, 5th & 95th percentiles, standard deviation. + return: + A dataframe indexed by features containing feature distribution information """ + # TODO: Modified to take in a test matrix, I think the function name should change + fstats = pd.DataFrame(columns=['feature', 'type', 'mean', 'stdev', 'median', 'p5', 'p95']) - dtypes = self.X_test.dtypes + dtypes = test_mat.dtypes + + # TODO: can vectorize? for col in self.column_names: - feat = self.X_test[col] + feat = test_mat[col] d = {'feature': col, 'mean': feat.mean(), 'median': feat.median(), @@ -135,7 +462,8 @@ def _populate_feature_stats(self): fstats = fstats.append(d, ignore_index=True) fstats.set_index('feature', inplace=True) - self.feature_stats = fstats + # self.feature_stats = fstats + return fstats def set_name_patterns(self, name_patterns): """Map regex patterns to column names. @@ -151,7 +479,7 @@ def set_name_patterns(self, name_patterns): regex mapping associated with the object). 
Arguments: - name_patters (list) A list of regex patterns or compiled regex objects to + name_patterns (list) A list of regex patterns or compiled regex objects to group together features for reporting contributions. If using, each feature name in the test matrix must match one and only one pattern. """ @@ -216,13 +544,31 @@ def _plot_graph(self, idx, pred_class, score, plt.show() - def _build_contrib_df(self, mean_by_trees_list, idx, how): + def _build_contrib_df_sample(self, mean_by_trees_list, how): + contrib_df = pd.DataFrame(mean_by_trees_list, + columns=['feature', 'contribution']) + contrib_df.set_index('feature', inplace=True) + + # if we're using name patterns, aggregate columns to pattern level, + # otherwise, join on column-level statistics (not available for pattern-level) + if how == 'patterns': + contrib_df = contrib_df.join(self.column_patterns, how='inner') + contrib_df = contrib_df.groupby(['name_pattern'])['contribution'].sum().to_frame() + + # sort the resulting dataframe in descending order by contribution + contrib_df.sort_values('contribution', ascending=False, inplace=True) + + return contrib_df + + def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, feature_stats, how='features'): """ Build contribution dataframe. In: - mean_by_trees_list (list): + - test_mat: The reference test matrix, a dataframe - idx: index for example + - sample: the row matrix of the sample. 
Either an idx or a sample should be provided - how: Whether to calculate feature contributions at the level of individual features Out: @@ -238,18 +584,23 @@ def _build_contrib_df(self, mean_by_trees_list, idx, how): contrib_df = contrib_df.join(self.column_patterns, how='inner') contrib_df = contrib_df.groupby(['name_pattern'])['contribution'].sum().to_frame() else: - contrib_df = contrib_df.join(self.feature_stats, how='left') + contrib_df = contrib_df.join(feature_stats, how='left') # lookup the specific example's values - for col in contrib_df.index.values: - + for col in contrib_df.index.values: if self.combined_index: - example_value = self.X_test.loc[idx, col].values[0] + if idx is not None: + example_value = test_mat.loc[idx, col].values[0] + else: + example_value = sample[list(self.column_names).index(col)] else: - example_value = self.X_test.loc[idx, col] + if idx is not None: + example_value = test_mat.loc[idx, col] + else: + example_value = sample[list(self.column_names).index(col)] contrib_df.loc[col, 'example_value'] = example_value - vals, pct_sco = self.X_test[col], example_value + vals, pct_sco = test_mat[col], example_value contrib_df.loc[col, 'example_pctl'] = stats.percentileofscore(vals, pct_sco) / 100.0 contrib_df['z_score'] = 1.0 * (contrib_df['example_value'] - contrib_df['mean']) @@ -378,52 +729,7 @@ def _plot_dists(self, df, num_features, ax): ax.set_facecolor('white') ax.set_title('Feature Distributions', fontsize=16) - def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how='features'): - """Graph or return individual feature importances for an example. - - This method is the primary interface for TheLorax to calculate individual feature - importances for a given example (identified by `idx`). It can be used to either - return a pandas DataFrame with contributions and feature distributions (if - `graph=False`) or a graphical representation of the top `num_features` contributions - (if `graph=True`, the default) for use in a jupyter notebook. 
- - Feature contributions can be calucalted either for all features separately (`how='features', - the default) or using regular expression patterns to group sets of features together - (`how='patterns'`). When graphing contributions for all features, graphs will contain two - components: - 1. A bar graph of the top num_features contributions to the example's score - 2. For each of these features, a graph showing the percentile for the feature's mean - across the entire test set (gray dot), the percentile of the feature value for the - example being explained (orange dot) and the z-score for that value - When using regular expression patterns, the feature distribution information is omitted - (from both graphical and dataframe outputs) as the contributions reflect aggregations over - an arbitrary number and types of features. - - Arguments: - idx (int) The entity id of the example we want to explain - pred_class (int) The predicted class for the example (currently must be 1 or 0). The - returned feature contributions will be taken relative to the score for this class. - If None (the default), the predicted class will be assigned based on whether the - example's score is above or below a threshold of 0.5. - num_features (int) The number of features with the highest contributions to graph - (ignored if `graph=False` in which case the entire set will be returned) - graph (bool) Whether to graph the feature contributions or return a dataframe - without graphing (default: True) - how (str) Whether to calculate feature contributions at the level of individual features - (`how='features'`, the default) or using regex patterns (`how='patterns'`). - If using regex patterns, `name_patterns` must have been provided when the object - was constructed or through calling `set_name_patterns()`. 
- - Returns: - If `graph=False`, returns a pandas dataframe with individual feature contributions - and (if using `how='features'`) feature distribution information - - """ - # TODO: Categoricals can be handled using regex patterns, but this currently precludes - # showing feature distribution information (since we don't know how to combine distributions - # for arbitary feature groupings), but if just using patterns for categoricals/imputed flags - # we should still be able to show relevant distribution info... - + def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, how='features'): if how == 'patterns' and self.column_patterns is None: raise ValueError('Must specify name patterns to aggregate over.' + 'Use TheLorax.set_name_patterns() first.') @@ -474,7 +780,12 @@ def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) # drop the results into a dataframe to append on other information - contrib_df = self._build_contrib_df(contrib_list, idx, how) + contrib_df = self._build_contrib_df( + contrib_list, test_mat=self.X_test, + sample=sample, + feature_stats=self.feature_stats, + idx=idx, how=how + ) # adding overall feature importance from model level overall_importance = [] @@ -503,4 +814,5 @@ def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, This method is just a synonym for `explain_example()` because TheLorax has to be able to speak for the trees. 
""" - return self.explain_example(id, pred_class, num_features, graph, how) + return self.explain_example(idx=id, pred_class=pred_class, + num_features=num_features, graph=graph, how=how) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index 85eca86..5e2ea53 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -1,4 +1,9 @@ """Tests for Lorax.""" +# TODO: Figure out how to do this optimally +import os +import sys +project_path = os.path.join(os.path.dirname(__file__), '../') +sys.path.append(project_path) import random import unittest @@ -10,7 +15,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression -from lorax import TheLorax +from lorax.lorax import TheLorax from lorax.utils import add_overall_feature_importance random.seed(42) @@ -48,7 +53,13 @@ class TestLorax(unittest.TestCase): def test_calculated_feature_importances(self): """Test calculated feature importances.""" # Setting up lorax - lrx = TheLorax(global_clf, data, id_col='entity_id') + lrx = TheLorax(clf=global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) feature1_contrib = lrx_out.contribution.loc['feature1'] @@ -60,7 +71,11 @@ def test_calculated_feature_importances(self): self.assertFalse('feature3' in lrx_out.contribution) def test_aggregated_dict(self): - """Test aggregated_dict.""" + """ + Test aggregated_dict. In the modified version the + aggregated dict is an element of a dictionary named model_info + + """ n_estimators = 5 max_depth = 1 @@ -71,14 +86,21 @@ def test_aggregated_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + _ = lrx.explain_example(idx=1, pred_class=1, graph=False) # Max depth is 1. 
Number of split_occurences must be equal to # occurences_in_n_trees. - for feature in lrx.aggregated_dict: - split_occ = lrx.aggregated_dict[feature]['diff_list']['split_occurences'] - occ_trees = lrx.aggregated_dict[feature]['mean_diff_list']['occurences_in_n_trees'] + aggregated_dict = lrx.model_info['aggregated_dict'] + for feature in aggregated_dict: + split_occ = aggregated_dict[feature]['diff_list']['split_occurences'] + occ_trees = aggregated_dict[feature]['mean_diff_list']['occurences_in_n_trees'] self.assertEqual(split_occ, occ_trees) def test_logistic_regression_importances(self): @@ -88,15 +110,27 @@ def test_logistic_regression_importances(self): clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] + feature1_contrib = round(feature1_contrib, 5) + feature5_contrib = round(feature5_contrib, 5) + true_feature1_contrib = round(2.186415806126551, 5) + true_feature5_contrib = round(-3.228614405467005, 5) + + # Test cases for correct feature importances - self.assertEqual(feature1_contrib, 2.186415806126551) - self.assertEqual(feature5_contrib, -3.228614405467005) + self.assertEqual(feature1_contrib, true_feature1_contrib) + self.assertEqual(feature5_contrib, true_feature5_contrib) # Test case if we can recover lr prediction # Can't use all of sample because it now contains intercept as last element @@ -107,7 +141,10 @@ def test_logistic_regression_importances(self): self.assertEqual(lrx_pred, lr_pred) def test_size_global_dict(self): - """Test the size of the global dict.""" + """ + Test the size of the global dict. 
Part of the model_info dictionary + + """ n_estimators = 3 max_depth = 1 @@ -118,21 +155,28 @@ def test_size_global_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + _ = lrx.explain_example(idx=1, pred_class=1, graph=False) # Checking if there as many entries, i.e., trees in # global_score_dict as number of estimators in forest - self.assertEqual(len(lrx.global_score_dict), n_estimators) + global_score_dict = lrx.model_info['global_score_dict'] + self.assertEqual(len(global_score_dict), n_estimators) # Checking if every dict entry, i.e., tree has max_depth keys # Since max_depth=1, every tree dict should have only one entry for i in range(n_estimators): - self.assertEqual(len(lrx.global_score_dict[i]), 1) + self.assertEqual(len(global_score_dict[i]), 1) # Checking if dicts for only feature in tree do not # have more than one entry - for tree_idx, feat_dict in lrx.global_score_dict.items(): + for tree_idx, feat_dict in global_score_dict.items(): self.assertEqual(len(feat_dict), 1) def test_add_overall_feature_importance(self): @@ -148,8 +192,19 @@ def test_add_overall_feature_importance(self): self.assertTupleEqual(true_result[i], result[i]) # Setting up lorax - lrx = TheLorax(global_clf, data, id_col='entity_id') - lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + lrx_out = lrx.explain_example( + idx=1, + pred_class=1, + graph=False, + descriptive=True + ) feature1_overall_imp = global_clf.feature_importances_[0] @@ -161,7 +216,13 @@ def test_multiple_rows_per_entity_id(self): """Test support of multiple rows per entity_id.""" # Setting up lorax # Getting output on test matrix with one row per 
entity_id - lrx = TheLorax(global_clf, data, id_col='entity_id') + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) # Changing test matrix so that the second row belongs @@ -171,7 +232,13 @@ def test_multiple_rows_per_entity_id(self): # Checking that the output for original row of entity 1 # remains the same when using combined index - lrx = TheLorax(global_clf, new_data, id_col=['entity_id', 'as_of_date']) + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=new_data, + id_col=['entity_id', 'as_of_date'], + date_col='as_of_date', + outcome_col='outcome') + out_multi_rows = lrx.explain_example(idx=(1, '2017-08-21 18:01:57.040781'), pred_class=1, graph=False) @@ -189,4 +256,4 @@ def test_multiple_rows_per_entity_id(self): if __name__ == "__main__": - unittest.main(exit=False) + unittest.main(exit=True) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py new file mode 100644 index 0000000..49cfa1d --- /dev/null +++ b/tests/test_new_lorax.py @@ -0,0 +1,212 @@ +import os +import sys +project_path = os.path.join(os.path.dirname(__file__), '../') +sys.path.append(project_path) + +import pandas as pd +import numpy as np +import random +from datetime import datetime +from sklearn import datasets +from sklearn.ensemble import RandomForestClassifier + +from lorax.lorax import TheLorax + +import unittest + +# Data generation for classification +X, y = datasets.make_classification(n_samples=10000, n_features=5, + n_informative=3, n_redundant=2, + random_state=42) + +# Preparing test matrix +start, end = datetime(2017, 1, 1), datetime(2017, 12, 31) +as_of_dates = np.asarray([start + (end - start) * random.random() for i in range(X.shape[0])]) +entity_ids = np.arange(1, X.shape[0] + 1) + +data = np.append(X, y.reshape(y.shape[0], 1), axis=1) +data = 
np.append(as_of_dates.reshape(y.shape[0], 1), data, axis=1) +data = np.append(entity_ids.reshape(y.shape[0], 1), data, axis=1) + +columns = ["entity_id", "as_of_date", "feature1", "feature2", + "feature3", "feature4", "feature5", "outcome"] + +features = [x for x in columns if x not in ['entity_id', 'as_of_date', 'outcome']] + +data = pd.DataFrame(data, columns=columns) + +# Testing the independence from id_col, date_col, outcome +# data = data.drop(['entity_id', 'as_of_date', 'outcome'], axis=1) + + +n_estimators = 2 +max_depth = 2 +global_clf = RandomForestClassifier(n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(X, y) + + +class TestLorax(unittest.TestCase): + """Tests cases for Lorax.""" + + def test_feature_importances(self): + """Test calculated feature importances.""" + # Setting up lorax + lrx = TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + # without id_col (zero indexed) + # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) + + sample = data.loc[0, features].values + + pred_class = 0 # The label w.r.t the explanations are generated + lrx_out = lrx.explain_example(sample=sample, + test_mat=None, + descriptive=True, + idx=None, + pred_class=pred_class, + num_features=10, + graph=False + ) + + feature1_contrib = lrx_out.contribution.loc['feature1'] + feature5_contrib = lrx_out.contribution.loc['feature5'] + + print('Asserting feature importance scores...') + + # Test cases for correct feature importances + if pred_class == 1: + self.assertEqual(feature1_contrib, 0.04889021376498209) + self.assertEqual(feature5_contrib, -0.31556073962118303) + else: + self.assertEqual(feature1_contrib, -0.04889021376498209) + self.assertEqual(feature5_contrib, 0.31556073962118303) + + self.assertFalse('feature3' in lrx_out.contribution) + + def test_feature_stats(self): + """Testing the data loader""" + + lrx = 
TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + st1 = lrx.populate_feature_stats(data[features]) + + pd.testing.assert_frame_equal(st1, lrx.feature_stats) + + def test_descriptive_explanation_cases(self): + """ + There are different methods to get a descriptive explanation + This test asserts all those methods yield the same answer + """ + pass + + def test_old_vs_new_lorax(self): + """ + Verifying that the new explain method is + generating the same explanations as before + + Note: This test was deprecated after verifying that the new explain instance + returned the same results as the old one. + The old method was removed from the class + """ + lrx = TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + pred_class = 0 # The label w.r.t the explanations are generated + idx = 2 + lrx_out_new = lrx.explain_example(sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False) + + lrx_out_old = lrx.explain_example_old(idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='features') + + pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) + + def test_explanation_patterns(self): + """ + Testing whether the explanations in terms of + feature patterns are generated correctly + """ + + # Creating the data with categorical columns to have regex feature patterns + X2, y2 = datasets.make_classification(n_samples=10000, + n_features=6, + n_informative=3, + n_redundant=2, + random_state=42) + + data2 = np.append(X2, y2.reshape(y2.shape[0], 1), axis=1) + columns2 = ["feature1", "feature2", + "feature3", "feature4", "feature5", + "category", "outcome"] + data2 = pd.DataFrame(data2, columns=columns2) + + # Creating the categorical features + data2['category'] = pd.cut(data2['category'], bins=2, labels=['a', 'b']) + + 
data2 = pd.get_dummies(data2, columns=['category']) + + features2 = [x for x in data2.columns.values if x not in ['entity_id', 'as_of_date', 'outcome']] + + n_estimators = 10 + max_depth = 4 + clf = RandomForestClassifier(n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(data2[features2].values, y2) + + lrx = TheLorax(clf=clf, + column_names=features2, + column_patterns=['feature', 'category_'], + test_mat=data2, + id_col=None, + date_col=None, + outcome_col='outcome') + + idx = 0 + sample = data2.loc[0, features2].values + pred_class = 0 # The label w.r.t the explanations are generated + + lrx_out = lrx.explain_example(sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns') + + lrx_out_old = lrx.explain_example_old(idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns') + + # Asserting that both methods yield the same answer + pd.testing.assert_frame_equal(lrx_out, lrx_out_old) + + +if __name__ == '__main__': + unittest.main() +