From 9b1e8ca0d6d4a92ae384970f6bc649c3c13cfccf Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 22 Jan 2020 23:46:45 +0000 Subject: [PATCH 01/80] renamed, will be reversed. added a new init and exp example --- lorax/{lorax.py => the_lorax.py} | 74 ++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) rename lorax/{lorax.py => the_lorax.py} (85%) diff --git a/lorax/lorax.py b/lorax/the_lorax.py similarity index 85% rename from lorax/lorax.py rename to lorax/the_lorax.py index bebda9e..1c94060 100644 --- a/lorax/lorax.py +++ b/lorax/the_lorax.py @@ -48,7 +48,73 @@ class TheLorax(object): each feature name in the test matrix must match one and only one pattern. """ - def __init__(self, clf, test_mat, id_col=None, + def __init__(self, clf, column_names, id_col=None, date_col=None): + self.clf = clf + self.column_names = column_names + self.id_col = id_col + self.date_col = date_col + + def explain_example(self, sample, num_features=10, how='features', descriptive=False, test_mat=None, graph=False): + """Graph or return individual feature importances for an example. + + This method is the primary interface for TheLorax to calculate individual feature + importances for a given example. + It can be used to either return a pandas DataFrame with contributions and feature distributions (if + `graph=False`) or a graphical representation of the top `num_features` contributions + (if `graph=True`, the default) for use in a jupyter notebook. + + Feature contributions can be calucalted either for all features separately (`how='features', + the default) or using regular expression patterns to group sets of features together + (`how='patterns'`). When graphing contributions for all features, graphs will contain two + components: + 1. A bar graph of the top num_features contributions to the example's score + 2. For each of these features, a graph showing the percentile for the feature's mean + across the entire test set (gray dot), the percentile of the feature value for the + example being explained (orange dot) and the z-score for that value + When using regular expression patterns, the feature distribution information is omitted + (from both graphical and dataframe outputs) as the contributions reflect aggregations over + an arbitrary number and types of features. + + Arguments: + idx (int) The entity id of the example we want to explain + pred_class (int) The predicted class for the example (currently must be 1 or 0). The + returned feature contributions will be taken relative to the score for this class. + If None (the default), the predicted class will be assigned based on whether the + example's score is above or below a threshold of 0.5. + num_features (int) The number of features with the highest contributions to graph + (ignored if `graph=False` in which case the entire set will be returned) + graph (bool) Whether to graph the feature contributions or return a dataframe + without graphing (default: True) + how (str) Whether to calculate feature contributions at the level of individual features + (`how='features'`, the default) or using regex patterns (`how='patterns'`). + If using regex patterns, `name_patterns` must have been provided when the object + was constructed or through calling `set_name_patterns()`. + + Returns: + If `graph=False`, returns a pandas dataframe with individual feature contributions + and (if using `how='features'`) feature distribution information + + """ + + if isinstance(self.clf, RandomForestClassifier): + # Getting values for Random Forest Classifier + return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) + + num_trees = return_tuple[0] + global_score_dict = return_tuple[1] + feature_dict = return_tuple[2] + aggregated_dict = return_tuple[3] + contrib_list = return_tuple[4] + + elif isinstance(self.clf, LogisticRegression): + # Getting values for Random Forest Classifier + contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) + + print(contrib_list) + + + + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', name_patterns=None): """ @@ -103,7 +169,7 @@ def __init__(self, clf, test_mat, id_col=None, # pre-calcuate feature distribution statistics for each feature self._populate_feature_stats() - + def _populate_feature_stats(self): """Setter function for feature distribution statistics. @@ -378,7 +444,7 @@ def _plot_dists(self, df, num_features, ax): ax.set_facecolor('white') ax.set_title('Feature Distributions', fontsize=16) - def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how='features'): + def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, how='features'): """Graph or return individual feature importances for an example. This method is the primary interface for TheLorax to calculate individual feature @@ -503,4 +569,4 @@ def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, This method is just a synonym for `explain_example()` because TheLorax has to be able to speak for the trees. """ - return self.explain_example(id, pred_class, num_features, graph, how) + return self.explain_example_old(id, pred_class, num_features, graph, how) From 27026560637b2ae674fd46e1dc6d3e19a26a096f Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 22 Jan 2020 23:48:16 +0000 Subject: [PATCH 02/80] renamed lorax file --- lorax/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lorax/__init__.py b/lorax/__init__.py index 6e34b36..1e64be3 100644 --- a/lorax/__init__.py +++ b/lorax/__init__.py @@ -1,3 +1,3 @@ """Main application.""" -from .lorax import TheLorax +from .the_lorax import TheLorax From 6e3c12893e3ed3bf81e7894a9c1abf58b952ce43 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 22 Jan 2020 23:49:45 +0000 Subject: [PATCH 03/80] vscode --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8b3032d..628881d 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ docs/_build # Virtualenvs env/ venv/ + +# VS code +.vscode \ No newline at end of file From 633ed106237930c1f3e098b6a66ccd6afe6f765b Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 22 Jan 2020 23:56:18 +0000 Subject: [PATCH 04/80] revereted the explain_example function name --- lorax/the_lorax.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 1c94060..d7774d9 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -54,7 +54,7 @@ def __init__(self, clf, column_names, id_col=None, date_col=None): self.id_col = id_col self.date_col = date_col - def explain_example(self, sample, num_features=10, how='features', descriptive=False, test_mat=None, graph=False): + def explain_example_new(self, sample, num_features=10, how='features', descriptive=False, test_mat=None, graph=False): """Graph or return individual feature importances for an example. This method is the primary interface for TheLorax to calculate individual feature @@ -444,7 +444,7 @@ def _plot_dists(self, df, num_features, ax): ax.set_facecolor('white') ax.set_title('Feature Distributions', fontsize=16) - def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, how='features'): + def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how='features'): """Graph or return individual feature importances for an example. This method is the primary interface for TheLorax to calculate individual feature @@ -569,4 +569,4 @@ def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, This method is just a synonym for `explain_example()` because TheLorax has to be able to speak for the trees. """ - return self.explain_example_old(id, pred_class, num_features, graph, how) + return self.explain_example(id, pred_class, num_features, graph, how) From 939292983e2b7ce2be111cc7121f70bcf990ae17 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 22 Jan 2020 23:57:37 +0000 Subject: [PATCH 05/80] file for testing new lorax --- tests/test_new_lorax.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/test_new_lorax.py diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py new file mode 100644 index 0000000..e69de29 From db9be54d0d0ec83936d45734b35ebaa9fd0e3fba Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 23 Jan 2020 00:25:22 +0000 Subject: [PATCH 06/80] tested exp._exam._new initial --- tests/test_new_lorax.py | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index e69de29..ccd4aef 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -0,0 +1,44 @@ +import os +import sys +project_path = os.path.join(os.path.dirname(__file__), '../') +sys.path.append(project_path) + +import pandas as pd +import numpy as np + +from sklearn.datasets import load_breast_cancer +from sklearn.ensemble import RandomForestClassifier + +from lorax.the_lorax import TheLorax +from lorax.utils import add_overall_feature_importance + + +def test_lorax_breast_cancer(): + data_dict = load_breast_cancer() + X = data_dict['data'] + y = data_dict['target'] + + columns = data_dict['feature_names'] + + data = pd.DataFrame(X, columns=columns) + + # model + n_estimators = 2 + max_depth = 2 + + global_clf = RandomForestClassifier(n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(X, y) + + # Lorax + lrx = TheLorax( + clf=global_clf, + column_names=columns, + id_col=None, date_col=None) + + sample = X[0, :] + + lrx.explain_example_new(sample=sample) + +if __name__ == '__main__': + test_lorax_breast_cancer() \ No newline at end of file From edae5ffebaf7984b5166691ad634a4e6f48a87bc Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 23 Jan 2020 20:51:28 +0000 Subject: [PATCH 07/80] creates a contribution DF for a sample --- lorax/the_lorax.py | 96 ++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index d7774d9..87dd4cf 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -48,54 +48,32 @@ class TheLorax(object): each feature name in the test matrix must match one and only one pattern. """ - def __init__(self, clf, column_names, id_col=None, date_col=None): + def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None): self.clf = clf self.column_names = column_names + self.column_patterns = column_patterns self.id_col = id_col self.date_col = date_col - def explain_example_new(self, sample, num_features=10, how='features', descriptive=False, test_mat=None, graph=False): - """Graph or return individual feature importances for an example. - - This method is the primary interface for TheLorax to calculate individual feature - importances for a given example. - It can be used to either return a pandas DataFrame with contributions and feature distributions (if - `graph=False`) or a graphical representation of the top `num_features` contributions - (if `graph=True`, the default) for use in a jupyter notebook. - - Feature contributions can be calucalted either for all features separately (`how='features', - the default) or using regular expression patterns to group sets of features together - (`how='patterns'`). When graphing contributions for all features, graphs will contain two - components: - 1. A bar graph of the top num_features contributions to the example's score - 2. For each of these features, a graph showing the percentile for the feature's mean - across the entire test set (gray dot), the percentile of the feature value for the - example being explained (orange dot) and the z-score for that value - When using regular expression patterns, the feature distribution information is omitted - (from both graphical and dataframe outputs) as the contributions reflect aggregations over - an arbitrary number and types of features. - - Arguments: - idx (int) The entity id of the example we want to explain - pred_class (int) The predicted class for the example (currently must be 1 or 0). The - returned feature contributions will be taken relative to the score for this class. - If None (the default), the predicted class will be assigned based on whether the - example's score is above or below a threshold of 0.5. - num_features (int) The number of features with the highest contributions to graph - (ignored if `graph=False` in which case the entire set will be returned) - graph (bool) Whether to graph the feature contributions or return a dataframe - without graphing (default: True) - how (str) Whether to calculate feature contributions at the level of individual features - (`how='features'`, the default) or using regex patterns (`how='patterns'`). - If using regex patterns, `name_patterns` must have been provided when the object - was constructed or through calling `set_name_patterns()`. + def explain_example_new(self, sample=None, + pred_class=None, + num_features=10, + how='features', + descriptive=False, test_mat=None, idx=None, graph=False): - Returns: - If `graph=False`, returns a pandas dataframe with individual feature contributions - and (if using `how='features'`) feature distribution information + # User has to pass either an index and a test_mat or a samples (a row) + if sample is None and test_mat is None and idx is None: + raise ValueError('Must either provide a data sample \ + or a test matrix with a sample index') - """ + if how == 'patterns' and self.column_patterns is None: + raise ValueError('Must specify name patterns to aggregate over.' + + 'Use TheLorax.set_name_patterns() first.') + elif how not in ['features', 'patterns']: + raise ValueError('How must be one of features or patterns.') + + # Calculating Feature contributions if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) @@ -110,10 +88,35 @@ def explain_example_new(self, sample, num_features=10, how='features', descripti # Getting values for Random Forest Classifier contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) - print(contrib_list) + # Setting the prediction class + if pred_class is None: + score = self.clf.predict_proba(sample.reshape(1, -1)) + score = score[0][0] + pred_class = int(score >= 0.5) + + # TODO: handle this more elegantly for multiclass problems + # We need to flip the sign of the scores. + if pred_class == 0: + score = 1.0 - score + contrib_list = [(feature, score * -1) for feature, score in contrib_list] + + # TODO: Need to be modified to not taking the index + # Replacing the example id with -1 for now + logging.info('Used predicted class {} for example {}, score={}'.format(pred_class, + -1, + score)) + # sorting in descending order by contribution then by feature name in the case of ties + contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) + self._build_contrib_df_sample(contrib_list, how=how) + # TODO: If descriptive is set, the importance scores + # are supplimented with the context provided by a test dataset + # The code is available in the original constructor, move it here + if descriptive: + pass + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', name_patterns=None): @@ -282,6 +285,17 @@ def _plot_graph(self, idx, pred_class, score, plt.show() + def _build_contrib_df_sample(self, mean_by_trees_list, how): + contrib_df = pd.DataFrame(mean_by_trees_list, + columns=['feature', 'contribution']) + contrib_df.set_index('feature', inplace=True) + + # sort the resulting dataframe in descending order by contribution + contrib_df.sort_values('contribution', ascending=False, inplace=True) + + return contrib_df + + def _build_contrib_df(self, mean_by_trees_list, idx, how): """ Build contribution dataframe. From 73c406ce7bdfb384e25224be55cd358f0668c5c7 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 16:15:44 +0000 Subject: [PATCH 08/80] notebooks --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 628881d..aca1c60 100644 --- a/.gitignore +++ b/.gitignore @@ -40,4 +40,7 @@ env/ venv/ # VS code -.vscode \ No newline at end of file +.vscode + +# Notebooks +notebooks/** \ No newline at end of file From fe41d253b89821eb1a282bce459445af3d607b87 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 16:53:06 +0000 Subject: [PATCH 09/80] combined index and TODOs --- lorax/the_lorax.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 87dd4cf..7e0c602 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -48,13 +48,19 @@ class TheLorax(object): each feature name in the test matrix must match one and only one pattern. """ - def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None): + def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None, outcome_col='outcome'): self.clf = clf + + # NOTE: Minor. maybe this should be feature_names self.column_names = column_names self.column_patterns = column_patterns self.id_col = id_col self.date_col = date_col + self.combined_index = False + if type(id_col) in [list, tuple]: + self.combined_index = True + def explain_example_new(self, sample=None, pred_class=None, num_features=10, @@ -65,6 +71,10 @@ def explain_example_new(self, sample=None, if sample is None and test_mat is None and idx is None: raise ValueError('Must either provide a data sample \ or a test matrix with a sample index') + + # A test matrix is necessary for getting descriptive stats + if descriptive and test_mat is None: + raise ValueError('Sould provide a test dataset for descriptive') if how == 'patterns' and self.column_patterns is None: raise ValueError('Must specify name patterns to aggregate over.' + @@ -72,7 +82,11 @@ def explain_example_new(self, sample=None, elif how not in ['features', 'patterns']: raise ValueError('How must be one of features or patterns.') - + # TODO: Add error handling for sample's features and the data features. + + if sample is None: + sample = test_mat.loc[idx].values + # Calculating Feature contributions if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier @@ -103,8 +117,8 @@ def explain_example_new(self, sample=None, # TODO: Need to be modified to not taking the index # Replacing the example id with -1 for now logging.info('Used predicted class {} for example {}, score={}'.format(pred_class, - -1, - score)) + -1, + score)) # sorting in descending order by contribution then by feature name in the case of ties contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) @@ -112,10 +126,11 @@ def explain_example_new(self, sample=None, self._build_contrib_df_sample(contrib_list, how=how) # TODO: If descriptive is set, the importance scores - # are supplimented with the context provided by a test dataset + # are supported with the context provided by a test dataset # The code is available in the original constructor, move it here if descriptive: pass + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', From 3562138435866149ed18608f5ffc3d77c79af6af Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 17:41:42 +0000 Subject: [PATCH 10/80] scarpcode function. --- tests/test_new_lorax.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index ccd4aef..81cca2a 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -40,5 +40,9 @@ def test_lorax_breast_cancer(): lrx.explain_example_new(sample=sample) +def scrap_code(): + pass + + if __name__ == '__main__': test_lorax_breast_cancer() \ No newline at end of file From 0822c6895b3a28b4a42223d48d4a4bf9639fc82f Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 18:56:14 +0000 Subject: [PATCH 11/80] can generate descriptive and indiv sample explanations --- lorax/the_lorax.py | 51 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 7e0c602..788d757 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -51,15 +51,35 @@ class TheLorax(object): def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None, outcome_col='outcome'): self.clf = clf - # NOTE: Minor. maybe this should be feature_names + # NOTE: Minor. maybe this should be feature_names and feature_patterns + # To separate from the index and the outcome self.column_names = column_names self.column_patterns = column_patterns + + # NOTE-KA: I feel like the method should be independent of these as these seem very triage specific. + # We can always have a script that bridges the triage data with the explain API self.id_col = id_col self.date_col = date_col + self.drop_cols = [] + if date_col is not None: + if date_col not in id_col: + self.drop_cols = [] + else: + self.drop_cols = [date_col] + + if outcome_col is not None: + self.drop_cols.append(outcome_col) + self.combined_index = False - if type(id_col) in [list, tuple]: - self.combined_index = True + if id_col is not None: + if type(id_col) in [list, tuple]: + self.combined_index = True + + # TODO: These should be moved out from the constructor. + # Current version of the code depends on their existence + self.X_test = None + self.feature_stats = None def explain_example_new(self, sample=None, pred_class=None, @@ -106,6 +126,8 @@ def explain_example_new(self, sample=None, if pred_class is None: score = self.clf.predict_proba(sample.reshape(1, -1)) score = score[0][0] + + # TODO: Multiclass adpatation pred_class = int(score >= 0.5) # TODO: handle this more elegantly for multiclass problems @@ -123,13 +145,30 @@ def explain_example_new(self, sample=None, # sorting in descending order by contribution then by feature name in the case of ties contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) - self._build_contrib_df_sample(contrib_list, how=how) - # TODO: If descriptive is set, the importance scores # are supported with the context provided by a test dataset # The code is available in the original constructor, move it here if descriptive: - pass + cols = list(test_mat.columns) + + # Removing the columns that need to be dropped + # NOTE-KA: Similar to the comment in the constructor, I think this should be handled outside of Lorax + for dr_col in self.drop_cols: + if dr_col in cols: + test_mat = test_mat.drop(dr_col, axis=1) + + + self.X_test = test_mat + + # NOTE-KA: I think this method should take in the test dataset as an argument + self._populate_feature_stats() + contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) + + else: + contrib_df = self._build_contrib_df_sample(contrib_list, how=how) + + print(contrib_df.head()) + def old_init(self, clf, test_mat, id_col=None, From 449668498d6a30e02d2070eca4634b9b82f719c5 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 18:56:50 +0000 Subject: [PATCH 12/80] tested the new explain examlpe --- tests/test_new_lorax.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 81cca2a..a1c54af 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -27,8 +27,8 @@ def test_lorax_breast_cancer(): max_depth = 2 global_clf = RandomForestClassifier(n_estimators=n_estimators, - max_depth=max_depth, - random_state=42).fit(X, y) + max_depth=max_depth, + random_state=42).fit(X, y) # Lorax lrx = TheLorax( @@ -38,7 +38,9 @@ def test_lorax_breast_cancer(): sample = X[0, :] - lrx.explain_example_new(sample=sample) + lrx.explain_example_new(sample=None, + descriptive=True, + test_mat=data, idx=1) def scrap_code(): pass From 913259a24b18e4ee600253788753e71cba731c21 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 19:39:05 +0000 Subject: [PATCH 13/80] returning the contribution df --- lorax/the_lorax.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 788d757..7210cd3 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -48,7 +48,7 @@ class TheLorax(object): each feature name in the test matrix must match one and only one pattern. """ - def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None, outcome_col='outcome'): + def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None, outcome_col=None): self.clf = clf # NOTE: Minor. maybe this should be feature_names and feature_patterns @@ -60,6 +60,7 @@ def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_co # We can always have a script that bridges the triage data with the explain API self.id_col = id_col self.date_col = date_col + self.outcome_col = outcome_col self.drop_cols = [] if date_col is not None: @@ -167,9 +168,8 @@ def explain_example_new(self, sample=None, else: contrib_df = self._build_contrib_df_sample(contrib_list, how=how) - print(contrib_df.head()) - - + return contrib_df + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', From 2e3d6d9c3ea2862b1ba3b201d30b2c907091f58c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 19:51:08 +0000 Subject: [PATCH 14/80] moved the drop_col removal, and changed drop_col list creation --- lorax/the_lorax.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 7210cd3..b2ecf1e 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -63,11 +63,8 @@ def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_co self.outcome_col = outcome_col self.drop_cols = [] - if date_col is not None: - if date_col not in id_col: - self.drop_cols = [] - else: - self.drop_cols = [date_col] + if date_col is not None and date_col not in id_col: + self.drop_cols.append(date_col) if outcome_col is not None: self.drop_cols.append(outcome_col) @@ -105,6 +102,18 @@ def explain_example_new(self, sample=None, # TODO: Add error handling for sample's features and the data features. + #checking for validity in the test data format + if test_mat is not None: + print(self.drop_cols) + cols = list(test_mat.columns) + + # Removing the columns that need to be dropped + # NOTE-KA: Similar to the comment in the constructor, I think this should be handled outside of Lorax + for dr_col in self.drop_cols: + if dr_col in cols: + test_mat = test_mat.drop(dr_col, axis=1) + + if sample is None: sample = test_mat.loc[idx].values @@ -124,10 +133,9 @@ def explain_example_new(self, sample=None, contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) # Setting the prediction class + score = self.clf.predict_proba(sample.reshape(1, -1)) + score = score[0][0] if pred_class is None: - score = self.clf.predict_proba(sample.reshape(1, -1)) - score = score[0][0] - # TODO: Multiclass adpatation pred_class = int(score >= 0.5) @@ -149,15 +157,7 @@ def explain_example_new(self, sample=None, # TODO: If descriptive is set, the importance scores # are supported with the context provided by a test dataset # The code is available in the original constructor, move it here - if descriptive: - cols = list(test_mat.columns) - - # Removing the columns that need to be dropped - # NOTE-KA: Similar to the comment in the constructor, I think this should be handled outside of Lorax - for dr_col in self.drop_cols: - if dr_col in cols: - test_mat = test_mat.drop(dr_col, axis=1) - + if descriptive: self.X_test = test_mat @@ -169,7 +169,7 @@ def explain_example_new(self, sample=None, contrib_df = self._build_contrib_df_sample(contrib_list, how=how) return contrib_df - + def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', From c61f6da299b675e9688711d9fced2174edeec3f0 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 19:58:22 +0000 Subject: [PATCH 15/80] setting the id_col as the index --- lorax/the_lorax.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index b2ecf1e..1ef810f 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -104,7 +104,6 @@ def explain_example_new(self, sample=None, #checking for validity in the test data format if test_mat is not None: - print(self.drop_cols) cols = list(test_mat.columns) # Removing the columns that need to be dropped @@ -113,6 +112,8 @@ def explain_example_new(self, sample=None, if dr_col in cols: test_mat = test_mat.drop(dr_col, axis=1) + if self.id_col is not None: + test_mat.set_index(self.id_col, inplace=True) if sample is None: sample = test_mat.loc[idx].values @@ -158,7 +159,6 @@ def explain_example_new(self, sample=None, # are supported with the context provided by a test dataset # The code is available in the original constructor, move it here if descriptive: - self.X_test = test_mat # NOTE-KA: I think this method should take in the test dataset as an argument From 9ff371f4dc2a50f011415135bc3ffe6e3c9e801c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 20:13:24 +0000 Subject: [PATCH 16/80] changed the column names --- lorax/the_lorax.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 1ef810f..9603e8a 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -102,7 +102,7 @@ def explain_example_new(self, sample=None, # TODO: Add error handling for sample's features and the data features. - #checking for validity in the test data format + # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns if test_mat is not None: cols = list(test_mat.columns) @@ -121,7 +121,7 @@ def explain_example_new(self, sample=None, # Calculating Feature contributions if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier - return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) + return_tuple = get_contrib_list_RF(self.clf, sample, test_mat.columns.values) num_trees = return_tuple[0] global_score_dict = return_tuple[1] @@ -131,7 +131,8 @@ def explain_example_new(self, sample=None, elif isinstance(self.clf, LogisticRegression): # Getting values for Random Forest Classifier - contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) + # TODO: The column names need to be consolidated + contrib_list = get_contrib_list_LR(self.clf, sample, test_mat.columns.values) # Setting the prediction class score = self.clf.predict_proba(sample.reshape(1, -1)) From a9755d72e68e0d54fe1984d44426689424b9a6ab Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 20:14:57 +0000 Subject: [PATCH 17/80] replicated Kits initial test for the new explain example --- tests/test_new_lorax.py | 69 ++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index a1c54af..bf8bc31 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -5,46 +5,65 @@ import pandas as pd import numpy as np - +import random +from datetime import datetime +from sklearn import datasets from sklearn.datasets import load_breast_cancer from sklearn.ensemble import RandomForestClassifier from lorax.the_lorax import TheLorax from lorax.utils import add_overall_feature_importance +import unittest + +# Data generation for classification +X, y = datasets.make_classification(n_samples=10000, n_features=5, + n_informative=3, n_redundant=2, + random_state=42) -def test_lorax_breast_cancer(): - data_dict = load_breast_cancer() - X = data_dict['data'] - y = data_dict['target'] +# Preparing test matrix +start, end = datetime(2017, 1, 1), datetime(2017, 12, 31) +as_of_dates = np.asarray([start + (end - start) * random.random() for i in range(X.shape[0])]) +entity_ids = np.arange(1, X.shape[0] + 1) - columns = data_dict['feature_names'] +data = np.append(X, y.reshape(y.shape[0], 1), axis=1) +data = np.append(as_of_dates.reshape(y.shape[0], 1), data, axis=1) +data = np.append(entity_ids.reshape(y.shape[0], 1), data, axis=1) - data = pd.DataFrame(X, columns=columns) +columns = ["entity_id", "as_of_date", "feature1", "feature2", + "feature3", "feature4", "feature5", "outcome"] - # model - n_estimators = 2 - max_depth = 2 +data = pd.DataFrame(data, columns=columns) - global_clf = RandomForestClassifier(n_estimators=n_estimators, - max_depth=max_depth, - random_state=42).fit(X, y) +n_estimators = 2 +max_depth = 2 +global_clf = RandomForestClassifier(n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(X, y) - # Lorax - lrx = TheLorax( - clf=global_clf, - column_names=columns, - id_col=None, date_col=None) +class TestLorax(unittest.TestCase): + """Tests cases for Lorax.""" - sample = X[0, :] + def test_calculated_feature_importances(self): + """Test calculated feature importances.""" + # Setting up lorax + lrx = TheLorax( + global_clf, + column_names=columns, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example_new(test_mat=data, idx=1, pred_class=1, graph=False) - lrx.explain_example_new(sample=None, - descriptive=True, - test_mat=data, idx=1) + feature1_contrib = lrx_out.contribution.loc['feature1'] + feature5_contrib = lrx_out.contribution.loc['feature5'] -def scrap_code(): - pass + # Test cases for correct feature importances + self.assertEqual(feature1_contrib, 0.04889021376498209) + self.assertEqual(feature5_contrib, -0.31556073962118303) + self.assertFalse('feature3' in lrx_out.contribution) if __name__ == '__main__': - test_lorax_breast_cancer() \ No newline at end of file + # test_lorax_breast_cancer() + unittest.main() \ No newline at end of file From 7d1aa6238fb6feb56df4dddf9b988c8e67883488 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 20:55:04 +0000 Subject: [PATCH 18/80] comment --- lorax/the_lorax.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 9603e8a..08838ef 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -122,7 +122,6 @@ def explain_example_new(self, sample=None, if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier return_tuple = get_contrib_list_RF(self.clf, sample, test_mat.columns.values) - num_trees = return_tuple[0] global_score_dict = return_tuple[1] feature_dict = return_tuple[2] @@ -137,8 +136,10 @@ def explain_example_new(self, sample=None, # Setting the prediction class score = self.clf.predict_proba(sample.reshape(1, -1)) score = score[0][0] + if pred_class is None: # TODO: Multiclass adpatation + # use np.argmax(), or clf.predict() pred_class = int(score >= 0.5) # TODO: handle this more elegantly for multiclass problems From a56b74094112caa1aef4257bfad706a55a320478 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:02:31 +0000 Subject: [PATCH 19/80] sample is a series --- lorax/the_lorax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 08838ef..a474236 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -134,7 +134,7 @@ def explain_example_new(self, sample=None, contrib_list = get_contrib_list_LR(self.clf, sample, test_mat.columns.values) # Setting the prediction class - score = self.clf.predict_proba(sample.reshape(1, -1)) + score = self.clf.predict_proba(sample.values.reshape(1, -1)) score = score[0][0] if pred_class is None: From 67b86d132d92be27df57734ff12ba452ab05ca97 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:03:44 +0000 Subject: [PATCH 20/80] tested, dependence on special columns and inputting the example as a series --- tests/test_new_lorax.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index bf8bc31..5d7c692 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -35,6 +35,10 @@ data = pd.DataFrame(data, columns=columns) +# Testing the independence from id_col, date_col, outcome +data = data.drop(['entity_id', 'as_of_date', 'outcome'], axis=1) + + n_estimators = 2 max_depth = 2 global_clf = RandomForestClassifier(n_estimators=n_estimators, @@ -50,10 +54,23 @@ def test_calculated_feature_importances(self): lrx = TheLorax( global_clf, column_names=columns, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') - lrx_out = lrx.explain_example_new(test_mat=data, idx=1, pred_class=1, graph=False) + id_col=None, + date_col=None, + outcome_col=None) + + # without id_col (zero indexed) + # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) + + sample = data.loc[0] + lrx_out = lrx.explain_example_new( + sample=sample, + test_mat=data, + idx=None, + pred_class=None, + graph=False) + + + print(lrx_out) feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] From dfe50ddfc43181f10d33a25198b5f0c49f24df96 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:09:15 +0000 Subject: [PATCH 21/80] sample has to be a np.array --- lorax/the_lorax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index a474236..08838ef 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -134,7 +134,7 @@ def explain_example_new(self, sample=None, contrib_list = get_contrib_list_LR(self.clf, sample, test_mat.columns.values) # Setting the prediction class - score = self.clf.predict_proba(sample.values.reshape(1, -1)) + score = self.clf.predict_proba(sample.reshape(1, -1)) score = score[0][0] if pred_class is None: From a909f1f4e2a481941befed7e05922ea786230f3a Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:10:54 +0000 Subject: [PATCH 22/80] tested np.array sample --- tests/test_new_lorax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 5d7c692..152fb7c 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -61,7 +61,7 @@ def test_calculated_feature_importances(self): # without id_col (zero indexed) # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) - sample = data.loc[0] + sample = data.loc[0].values lrx_out = lrx.explain_example_new( sample=sample, test_mat=data, From 690bec644faf8e9a8d5d3e5d3649ce7656088bcd Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:31:30 +0000 Subject: [PATCH 23/80] changed the input configuration conditions --- lorax/the_lorax.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 08838ef..1e302f8 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -86,13 +86,12 @@ def explain_example_new(self, sample=None, descriptive=False, test_mat=None, idx=None, graph=False): # User has to pass either an index and a test_mat or a samples (a row) - if sample is None and test_mat is None and idx is None: - raise ValueError('Must either provide a data sample \ - or a test matrix with a sample index') + if sample is None and (test_mat is None or idx is None): + raise ValueError('Must either provide a data sample or a test matrix with a sample index') # A test matrix is necessary for getting descriptive stats - if descriptive and test_mat is None: - raise ValueError('Sould provide a test dataset for descriptive') + if descriptive and (test_mat is None or idx is None): + raise ValueError('Sould provide a test dataset and a sample index for descriptive') if how == 'patterns' and self.column_patterns is None: raise ValueError('Must specify name patterns to aggregate over.' + @@ -115,8 +114,8 @@ def explain_example_new(self, sample=None, if self.id_col is not None: test_mat.set_index(self.id_col, inplace=True) - if sample is None: - sample = test_mat.loc[idx].values + if idx is not None: + sample = test_mat.loc[idx].values # Calculating Feature contributions if isinstance(self.clf, RandomForestClassifier): @@ -378,7 +377,7 @@ def _build_contrib_df(self, mean_by_trees_list, idx, how): # lookup the specific example's values for col in contrib_df.index.values: - + # NOTE-KA: This way, the sample has to be an element of the test dataset if self.combined_index: example_value = self.X_test.loc[idx, col].values[0] else: From 51652b2ddcddb40558868d52fb16fd59f19df66b Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 21:34:23 +0000 Subject: [PATCH 24/80] added the graph command to descriptive --- lorax/the_lorax.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 1e302f8..94f3a82 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -159,16 +159,23 @@ def explain_example_new(self, sample=None, # TODO: If descriptive is set, the importance scores # are supported with the context provided by a test dataset # The code is available in the original constructor, move it here - if descriptive: + if descriptive: + # If descriptive, it rolls back to the original case self.X_test = test_mat # NOTE-KA: I think this method should take in the test dataset as an argument self._populate_feature_stats() contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) + if graph: + self._plot_graph(idx, pred_class, score, + num_features, contrib_df, how) + else: contrib_df = self._build_contrib_df_sample(contrib_list, how=how) + + return contrib_df From cbcffb4b003306e18f832896bc587428e694df20 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 22:20:43 +0000 Subject: [PATCH 25/80] fixed the plot errors and added a note --- lorax/the_lorax.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lorax/the_lorax.py b/lorax/the_lorax.py index 94f3a82..5b45769 100644 --- a/lorax/the_lorax.py +++ b/lorax/the_lorax.py @@ -168,13 +168,16 @@ def explain_example_new(self, sample=None, contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) if graph: + # NOTE-KA: num features seem to be a unnecessary dependency. + # Defaulting to 10 in the function signature is less general. + # For now adding a error check, if passed value > # features in the df, will plot all values + # But, I feel like it should be handled more elegantly + num_features = min(num_features, contrib_df.shape[0]) self._plot_graph(idx, pred_class, score, num_features, contrib_df, how) else: contrib_df = self._build_contrib_df_sample(contrib_list, how=how) - - return contrib_df From ec27c3961fa621e691755a44e325931a121d51e9 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Fri, 24 Jan 2020 22:21:09 +0000 Subject: [PATCH 26/80] tested plot --- tests/test_new_lorax.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 152fb7c..9720465 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -53,7 +53,7 @@ def test_calculated_feature_importances(self): # Setting up lorax lrx = TheLorax( global_clf, - column_names=columns, + column_names=data.columns.values, id_col=None, date_col=None, outcome_col=None) @@ -63,11 +63,13 @@ def test_calculated_feature_importances(self): sample = data.loc[0].values lrx_out = lrx.explain_example_new( - sample=sample, + sample=None, test_mat=data, - idx=None, - pred_class=None, - graph=False) + descriptive=True, + idx=0, + pred_class=1, + num_features=10, + graph=True) print(lrx_out) From 771f87b005fd396b6447718f13260253921a95d0 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 27 Jan 2020 23:23:58 +0000 Subject: [PATCH 27/80] renamed to original --- lorax/{the_lorax.py => lorax.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lorax/{the_lorax.py => lorax.py} (100%) diff --git a/lorax/the_lorax.py b/lorax/lorax.py similarity index 100% rename from lorax/the_lorax.py rename to lorax/lorax.py From 17b3bf6b4cf980c5e6cb73866562660d449386e9 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 28 Jan 2020 00:29:19 +0000 Subject: [PATCH 28/80] reverted the package name --- lorax/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lorax/__init__.py b/lorax/__init__.py index 1e64be3..6e34b36 100644 --- a/lorax/__init__.py +++ b/lorax/__init__.py @@ -1,3 +1,3 @@ """Main application.""" -from .the_lorax import TheLorax +from .lorax import TheLorax From e9fe8a686bbc4a104be6a07279d52b75e6a549b2 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 28 Jan 2020 17:55:21 +0000 Subject: [PATCH 29/80] feature names for explainer --- lorax/lorax.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 5b45769..85c318b 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -120,7 +120,7 @@ def explain_example_new(self, sample=None, # Calculating Feature contributions if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier - return_tuple = get_contrib_list_RF(self.clf, sample, test_mat.columns.values) + return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) num_trees = return_tuple[0] global_score_dict = return_tuple[1] feature_dict = return_tuple[2] @@ -130,7 +130,7 @@ def explain_example_new(self, sample=None, elif isinstance(self.clf, LogisticRegression): # Getting values for Random Forest Classifier # TODO: The column names need to be consolidated - contrib_list = get_contrib_list_LR(self.clf, sample, test_mat.columns.values) + contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) # Setting the prediction class score = self.clf.predict_proba(sample.reshape(1, -1)) From b96c17009fdace57006ab257e1ee286da7a25db1 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 29 Jan 2020 21:09:14 +0000 Subject: [PATCH 30/80] sending a series to explain example --- lorax/lorax.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 85c318b..62ee0f5 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -100,7 +100,9 @@ def explain_example_new(self, sample=None, raise ValueError('How must be one of features or patterns.') # TODO: Add error handling for sample's features and the data features. - + if isinstance(sample, pd.Series): + sample = sample.values + # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns if test_mat is not None: cols = list(test_mat.columns) From 42ceed1d88a69cab0f133e3d981925ace23d6c43 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 12 Feb 2020 21:21:52 +0000 Subject: [PATCH 31/80] fixed the prediction label issue --- lorax/lorax.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 62ee0f5..dd432a5 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -2,6 +2,7 @@ import re import logging import pandas as pd +import numpy as np from math import sqrt from scipy import stats @@ -135,13 +136,17 @@ def explain_example_new(self, sample=None, contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) # Setting the prediction class - score = self.clf.predict_proba(sample.reshape(1, -1)) - score = score[0][0] + # predict proba returns a 2D array + scores = self.clf.predict_proba(sample.reshape(1, -1))[0] if pred_class is None: # TODO: Multiclass adpatation # use np.argmax(), or clf.predict() - pred_class = int(score >= 0.5) + pred_class = np.argmax(scores) + + # Prediction score for the given pred_class + score = scores[pred_class] + print(pred_class, score) # TODO: handle this more elegantly for multiclass problems # We need to flip the sign of the scores. @@ -650,4 +655,5 @@ def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, This method is just a synonym for `explain_example()` because TheLorax has to be able to speak for the trees. """ + # TODO: Make sure this is adapted to the new method return self.explain_example(id, pred_class, num_features, graph, how) From 8dd1c99fe41c0a7889c0c1455964f739d89b8cea Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 12 Feb 2020 21:23:17 +0000 Subject: [PATCH 32/80] tested the prediction label vs feature importance sign --- tests/test_new_lorax.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 9720465..618ff8e 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -11,7 +11,7 @@ from sklearn.datasets import load_breast_cancer from sklearn.ensemble import RandomForestClassifier -from lorax.the_lorax import TheLorax +from lorax.lorax import TheLorax from lorax.utils import add_overall_feature_importance import unittest @@ -82,7 +82,36 @@ def test_calculated_feature_importances(self): self.assertEqual(feature5_contrib, -0.31556073962118303) self.assertFalse('feature3' in lrx_out.contribution) +def test_scores_lorax(): + lrx = TheLorax( + global_clf, + column_names=data.columns.values, + id_col=None, + date_col=None, + outcome_col=None) + + # without id_col (zero indexed) + # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) + + sample = data.loc[0].values + lrx_out = lrx.explain_example_new( + sample=sample, + test_mat=data, + descriptive=True, + idx=0, + pred_class=0, + num_features=10, + graph=False) + + + print(lrx_out) + feature1_contrib = lrx_out.contribution.loc['feature1'] + feature5_contrib = lrx_out.contribution.loc['feature5'] + + print(feature1_contrib, feature5_contrib) + if __name__ == '__main__': # test_lorax_breast_cancer() - unittest.main() \ No newline at end of file + # unittest.main() + test_scores_lorax() \ No newline at end of file From d46aabb89b12c9ba4be5873412fe5eac81f18628 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 12 Feb 2020 21:26:04 +0000 Subject: [PATCH 33/80] removed a comment --- lorax/lorax.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index dd432a5..7755373 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -146,8 +146,7 @@ def explain_example_new(self, sample=None, # Prediction score for the given pred_class score = scores[pred_class] - print(pred_class, score) - + # TODO: handle this more elegantly for multiclass problems # We need to flip the sign of the scores. if pred_class == 0: From 87a510b7688af25ae16399b091d2537bbb96e421 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 12 Feb 2020 21:29:43 +0000 Subject: [PATCH 34/80] comment --- tests/test_new_lorax.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 618ff8e..eb04736 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -83,6 +83,7 @@ def test_calculated_feature_importances(self): self.assertFalse('feature3' in lrx_out.contribution) def test_scores_lorax(): + """New test to check the importance scores and their signs (+) or (-)""" lrx = TheLorax( global_clf, column_names=data.columns.values, From a94886f55405a22296b0e09be1a90fee76539711 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 13 Feb 2020 22:40:15 +0000 Subject: [PATCH 35/80] the model info dictionary --- lorax/lorax.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 7755373..b6eead2 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -80,6 +80,9 @@ def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_co self.X_test = None self.feature_stats = None + # When populated, this will contain the component information of the model + self.model_info = dict() + def explain_example_new(self, sample=None, pred_class=None, num_features=10, @@ -124,12 +127,17 @@ def explain_example_new(self, sample=None, if isinstance(self.clf, RandomForestClassifier): # Getting values for Random Forest Classifier return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) - num_trees = return_tuple[0] - global_score_dict = return_tuple[1] - feature_dict = return_tuple[2] - aggregated_dict = return_tuple[3] + + # Feature importance scores contrib_list = return_tuple[4] + # Model component information + self.model_info['num_trees'] = return_tuple[0] + self.model_info['global_score_dict'] = return_tuple[1] + self.model_info['feature_dict'] = return_tuple[2] + self.model_info['aggregated_dict'] = return_tuple[3] + + elif isinstance(self.clf, LogisticRegression): # Getting values for Random Forest Classifier # TODO: The column names need to be consolidated From 7f7d158ce7380b259865c5adfad49e954e67ef48 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 13 Feb 2020 23:09:51 +0000 Subject: [PATCH 36/80] added the overall feature importance stuff from the olf method --- lorax/lorax.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index b6eead2..52986a5 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -135,8 +135,7 @@ def explain_example_new(self, sample=None, self.model_info['num_trees'] = return_tuple[0] self.model_info['global_score_dict'] = return_tuple[1] self.model_info['feature_dict'] = return_tuple[2] - self.model_info['aggregated_dict'] = return_tuple[3] - + self.model_info['aggregated_dict'] = return_tuple[3] elif isinstance(self.clf, LogisticRegression): # Getting values for Random Forest Classifier @@ -181,6 +180,28 @@ def explain_example_new(self, sample=None, self._populate_feature_stats() contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) + + # adding overall feature importance from model level + overall_importance = [] + for i, cname in enumerate(self.column_names): + if isinstance(self.clf, LogisticRegression): + overall_importance.append((cname, self.clf.coef_[0][i])) + + elif isinstance(self.clf, RandomForestClassifier): + overall_importance.append((cname, self.clf.feature_importances_[i])) + + else: + pass + + updated_list = add_overall_feature_importance(contrib_list, overall_importance) + updated_columns = ['feature', 'sample_rank', 'overall_imp', 'overall_rank', 'rank_change'] + contrib_df = contrib_df.join( + pd.DataFrame( + data=updated_list, + columns=updated_columns + ).set_index('feature') + ) + if graph: # NOTE-KA: num features seem to be a unnecessary dependency. # Defaulting to 10 in the function signature is less general. @@ -192,7 +213,7 @@ def explain_example_new(self, sample=None, else: contrib_df = self._build_contrib_df_sample(contrib_list, how=how) - + return contrib_df From aefa0c0a3eca6b89d2dc0f1fc74f456cbc284243 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 13 Feb 2020 23:12:29 +0000 Subject: [PATCH 37/80] PEP8 --- lorax/lorax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 52986a5..0a49763 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -180,7 +180,6 @@ def explain_example_new(self, sample=None, self._populate_feature_stats() contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) - # adding overall feature importance from model level overall_importance = [] for i, cname in enumerate(self.column_names): From 7fdd7cd0a36494dc6019c5a77520c94d2ca5f9c2 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 19 Feb 2020 21:42:58 +0000 Subject: [PATCH 38/80] function to load a test_mat and calculate the feature stats once --- lorax/lorax.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 0a49763..99c6dd7 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -49,7 +49,7 @@ class TheLorax(object): each feature name in the test matrix must match one and only one pattern. """ - def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_col=None, outcome_col=None): + def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_col=None, date_col=None, outcome_col=None): self.clf = clf # NOTE: Minor. maybe this should be feature_names and feature_patterns @@ -63,6 +63,7 @@ def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_co self.date_col = date_col self.outcome_col = outcome_col + # TODO: Move this to the load dataset self.drop_cols = [] if date_col is not None and date_col not in id_col: self.drop_cols.append(date_col) @@ -78,11 +79,60 @@ def __init__(self, clf, column_names, column_patterns=None, id_col=None, date_co # TODO: These should be moved out from the constructor. # Current version of the code depends on their existence self.X_test = None + self.y_test = None + self.preds = None self.feature_stats = None + if test_mat is not None: + self.load_dataset(test_mat=test_mat, id_col=id_col, date_col=date_col, oucome_col=outcome_col) + # When populated, this will contain the component information of the model self.model_info = dict() + def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outcome_col=None): + """ A user can load a test dataset to the object. + This dataset can be used to provide context to the individual predition explanations. + + param test_mat: A pandas dataframe + param id_col: The name(s) of the columns to uniquely identify an instance (entity_id in triage land) + param date_col: + param outcome_col: + + return: None + """ + + df = test_mat.copy() + + if id_col is not None: + df.set_index(id_col, inplace=True) + # TODO: minor, check whether this is the ideal way of doing this + if type(id_col) in [list, tuple]: + self.combined_index = True + + drop_cols = list() + if date_col not in id_col: + drop_cols.append(drop_cols) + + if outcome_col is not None: + drop_cols.append(outcome_col) + + self.y_test = df[outcome_col] + + self.X_test = df.drop(drop_cols, axis=1) + + # Setting the predictions of the test dataset + self.preds = pd.DataFrame( + {'pred': [p[1] for p in self.clf.predict_proba(self.X_test.values)]}, + index=self.X_test.index + ) + + # For classifiers with intercepts, we add the intercept as a "feature" + if hasattr(self.clf, 'intercept_'): + self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] + + # pre-calculating the feature distributions + self._populate_feature_stats() + def explain_example_new(self, sample=None, pred_class=None, num_features=10, From a051ce56696766f921c8599555f2bc541654e12a Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 19 Feb 2020 21:43:25 +0000 Subject: [PATCH 39/80] tested whether the code broke --- tests/test_new_lorax.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index eb04736..73b3c58 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -97,8 +97,8 @@ def test_scores_lorax(): sample = data.loc[0].values lrx_out = lrx.explain_example_new( sample=sample, - test_mat=data, - descriptive=True, + test_mat=None, + descriptive=False, idx=0, pred_class=0, num_features=10, From 1efa5b8ee1e14c0c8500f092e92fe605b7b61926 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 18:05:58 +0000 Subject: [PATCH 40/80] moved the drop cols handling to load dataset --- lorax/lorax.py | 53 ++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 99c6dd7..60720db 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -64,12 +64,12 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co self.outcome_col = outcome_col # TODO: Move this to the load dataset - self.drop_cols = [] - if date_col is not None and date_col not in id_col: - self.drop_cols.append(date_col) + # self.drop_cols = [] + # if date_col is not None and date_col not in id_col: + # self.drop_cols.append(date_col) - if outcome_col is not None: - self.drop_cols.append(outcome_col) + # if outcome_col is not None: + # self.drop_cols.append(outcome_col) self.combined_index = False if id_col is not None: @@ -81,17 +81,19 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co self.X_test = None self.y_test = None self.preds = None + self.drop_cols = list() self.feature_stats = None if test_mat is not None: - self.load_dataset(test_mat=test_mat, id_col=id_col, date_col=date_col, oucome_col=outcome_col) + self.load_dataset(test_mat=test_mat, id_col=id_col, date_col=date_col, outcome_col=outcome_col) # When populated, this will contain the component information of the model self.model_info = dict() def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outcome_col=None): - """ A user can load a test dataset to the object. + """ Loading a test dataset to the object. This dataset can be used to provide context to the individual predition explanations. + Context entails observing feature distributions param test_mat: A pandas dataframe param id_col: The name(s) of the columns to uniquely identify an instance (entity_id in triage land) @@ -108,17 +110,17 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco # TODO: minor, check whether this is the ideal way of doing this if type(id_col) in [list, tuple]: self.combined_index = True - - drop_cols = list() - if date_col not in id_col: - drop_cols.append(drop_cols) + + # exclude non-feature columns (date, outcome if present) + if date_col not in id_col: + self.drop_cols.append(date_col) if outcome_col is not None: - drop_cols.append(outcome_col) + self.drop_cols.append(outcome_col) self.y_test = df[outcome_col] - self.X_test = df.drop(drop_cols, axis=1) + self.X_test = df.drop(self.drop_cols, axis=1) # Setting the predictions of the test dataset self.preds = pd.DataFrame( @@ -137,15 +139,17 @@ def explain_example_new(self, sample=None, pred_class=None, num_features=10, how='features', - descriptive=False, test_mat=None, idx=None, graph=False): + descriptive=False, + test_mat=None, + idx=None, graph=False): # User has to pass either an index and a test_mat or a samples (a row) if sample is None and (test_mat is None or idx is None): raise ValueError('Must either provide a data sample or a test matrix with a sample index') # A test matrix is necessary for getting descriptive stats - if descriptive and (test_mat is None or idx is None): - raise ValueError('Sould provide a test dataset and a sample index for descriptive') + if descriptive and (test_mat is None and sample is None and idx is None): + raise ValueError('Sould provide a test dataset and a sample/sample index for descriptive') if how == 'patterns' and self.column_patterns is None: raise ValueError('Must specify name patterns to aggregate over.' + @@ -158,18 +162,17 @@ def explain_example_new(self, sample=None, sample = sample.values # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns - if test_mat is not None: - cols = list(test_mat.columns) - - # Removing the columns that need to be dropped - # NOTE-KA: Similar to the comment in the constructor, I think this should be handled outside of Lorax - for dr_col in self.drop_cols: - if dr_col in cols: - test_mat = test_mat.drop(dr_col, axis=1) - + if test_mat is not None: + # Indexing and exclusing non-feature columns + # NOTE-KA: I think this should be handled outside of Lorax if self.id_col is not None: test_mat.set_index(self.id_col, inplace=True) + for dr_col in self.drop_cols: + # Dropping the non-feature columns in the new test matrix, if they exist + # TODO: Handle the ID cols, Date cols elegantly + test_mat = test_mat.drop(dr_col, axis=1, errors='ignore') + if idx is not None: sample = test_mat.loc[idx].values From 1cfd88649669daad113d80e06450331cc0ac814c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 18:06:54 +0000 Subject: [PATCH 41/80] tested descriptive explanations. Noticed error in contrib_df_creation --- tests/test_new_lorax.py | 56 ++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 73b3c58..b645a76 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -48,12 +48,13 @@ class TestLorax(unittest.TestCase): """Tests cases for Lorax.""" - def test_calculated_feature_importances(self): + def test_feature_importances(self): """Test calculated feature importances.""" # Setting up lorax lrx = TheLorax( - global_clf, + clf=global_clf, column_names=data.columns.values, + test_mat=None, id_col=None, date_col=None, outcome_col=None) @@ -62,26 +63,45 @@ def test_calculated_feature_importances(self): # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) sample = data.loc[0].values + pred_class = 0 # The label w.r.t the explanations are generated lrx_out = lrx.explain_example_new( + sample=sample, + test_mat=data, + descriptive=True, + idx=None, + pred_class=pred_class, + num_features=10, + graph=False + ) + + lrx_out2 = lrx.explain_example_new( sample=None, test_mat=data, descriptive=True, idx=0, - pred_class=1, + pred_class=pred_class, num_features=10, - graph=True) - - - print(lrx_out) + graph=False + ) feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] + print('Asserting feature importance scores...') # Test cases for correct feature importances - self.assertEqual(feature1_contrib, 0.04889021376498209) - self.assertEqual(feature5_contrib, -0.31556073962118303) + if pred_class == 1: + self.assertEqual(feature1_contrib, 0.04889021376498209) + self.assertEqual(feature5_contrib, -0.31556073962118303) + else: + self.assertEqual(feature1_contrib, -0.04889021376498209) + self.assertEqual(feature5_contrib, 0.31556073962118303) + self.assertFalse('feature3' in lrx_out.contribution) + print('Asserting the descriptive feature contributions...') + + self.assertEqual(lrx_out, lrx_out2) + def test_scores_lorax(): """New test to check the importance scores and their signs (+) or (-)""" lrx = TheLorax( @@ -104,15 +124,21 @@ def test_scores_lorax(): num_features=10, graph=False) + lrx.load_dataset(test_mat=data) + + print(self.X_test.head()) + + # print(lrx_out) + + + # feature1_contrib = lrx_out.contribution.loc['feature1'] + # feature5_contrib = lrx_out.contribution.loc['feature5'] - print(lrx_out) - feature1_contrib = lrx_out.contribution.loc['feature1'] - feature5_contrib = lrx_out.contribution.loc['feature5'] + # print(feature1_contrib, feature5_contrib) - print(feature1_contrib, feature5_contrib) if __name__ == '__main__': # test_lorax_breast_cancer() - # unittest.main() - test_scores_lorax() \ No newline at end of file + unittest.main() + # test_scores_lorax() \ No newline at end of file From e7a01cf4f4f05c7847047d788b38e8c0d432d407 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 18:27:59 +0000 Subject: [PATCH 42/80] removed X_test dependency and added capability to send a sample to contrib df --- lorax/lorax.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 60720db..cd11f2f 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -448,18 +448,21 @@ def _build_contrib_df_sample(self, mean_by_trees_list, how): return contrib_df - def _build_contrib_df(self, mean_by_trees_list, idx, how): + def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='features'): """ Build contribution dataframe. In: - mean_by_trees_list (list): + - test_mat: The reference test matrix, a dataframe - idx: index for example + - sample: the row matrix of the sample. Either a idx or sample should be provided - how: Whether to calculate feature contributions at the level of individual features Out: - contrib_df (pandas DF) """ + contrib_df = pd.DataFrame(mean_by_trees_list, columns=['feature', 'contribution']) contrib_df.set_index('feature', inplace=True) @@ -473,15 +476,21 @@ def _build_contrib_df(self, mean_by_trees_list, idx, how): contrib_df = contrib_df.join(self.feature_stats, how='left') # lookup the specific example's values - for col in contrib_df.index.values: + for i, col in enumerate(contrib_df.index.values): # NOTE-KA: This way, the sample has to be an element of the test dataset if self.combined_index: - example_value = self.X_test.loc[idx, col].values[0] + if idx is not None: + example_value = test_mat.loc[idx, col].values[0] + else: + example_value = sample[i] else: - example_value = self.X_test.loc[idx, col] + if idx is not None: + example_value = self.X_test.loc[idx, col] + else: + example_value = sample[i] contrib_df.loc[col, 'example_value'] = example_value - vals, pct_sco = self.X_test[col], example_value + vals, pct_sco = test_mat[col], example_value contrib_df.loc[col, 'example_pctl'] = stats.percentileofscore(vals, pct_sco) / 100.0 contrib_df['z_score'] = 1.0 * (contrib_df['example_value'] - contrib_df['mean']) From d4d9f24d814302f12930a1d9adddac12d1c07d2c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 18:28:31 +0000 Subject: [PATCH 43/80] removed note --- lorax/lorax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index cd11f2f..f2e4ebd 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -477,7 +477,6 @@ def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='feat # lookup the specific example's values for i, col in enumerate(contrib_df.index.values): - # NOTE-KA: This way, the sample has to be an element of the test dataset if self.combined_index: if idx is not None: example_value = test_mat.loc[idx, col].values[0] From c0ffa3e6933929198c46f525b4f49dfba672b814 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 19:17:09 +0000 Subject: [PATCH 44/80] linting --- lorax/lorax.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index f2e4ebd..99e4f5f 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -148,8 +148,8 @@ def explain_example_new(self, sample=None, raise ValueError('Must either provide a data sample or a test matrix with a sample index') # A test matrix is necessary for getting descriptive stats - if descriptive and (test_mat is None and sample is None and idx is None): - raise ValueError('Sould provide a test dataset and a sample/sample index for descriptive') + if descriptive and (test_mat is None and self.X_test is None): + raise ValueError('Sould provide a test dataset or should have preloaded a test dataset') if how == 'patterns' and self.column_patterns is None: raise ValueError('Must specify name patterns to aggregate over.' + @@ -227,11 +227,18 @@ def explain_example_new(self, sample=None, # The code is available in the original constructor, move it here if descriptive: # If descriptive, it rolls back to the original case - self.X_test = test_mat + if test_mat is None: + test_mat = self.X_test # NOTE-KA: I think this method should take in the test dataset as an argument self._populate_feature_stats() - contrib_df = self._build_contrib_df(contrib_list, idx=idx, how=how) + contrib_df = self._build_contrib_df( + contrib_list, + test_mat=test_mat, + idx=idx, + sample=sample, + how=how + ) # adding overall feature importance from model level overall_importance = [] @@ -462,7 +469,6 @@ def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='feat Out: - contrib_df (pandas DF) """ - contrib_df = pd.DataFrame(mean_by_trees_list, columns=['feature', 'contribution']) contrib_df.set_index('feature', inplace=True) From e42b7be9dbe78236becb64b8d26154fa9fbbd564 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 19:39:37 +0000 Subject: [PATCH 45/80] feature stats takes a dataframe as argument --- lorax/lorax.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 99e4f5f..6bfb60c 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -133,7 +133,7 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] # pre-calculating the feature distributions - self._populate_feature_stats() + self.feature_stats = self._populate_feature_stats(test_mat=self.X_test) def explain_example_new(self, sample=None, pred_class=None, @@ -229,14 +229,16 @@ def explain_example_new(self, sample=None, # If descriptive, it rolls back to the original case if test_mat is None: test_mat = self.X_test + fstats = self.feature_stats + else: + fstats = self._populate_feature_stats(test_mat) - # NOTE-KA: I think this method should take in the test dataset as an argument - self._populate_feature_stats() contrib_df = self._build_contrib_df( contrib_list, test_mat=test_mat, idx=idx, sample=sample, + feature_stats=fstats, how=how ) @@ -332,16 +334,20 @@ def old_init(self, clf, test_mat, id_col=None, # pre-calcuate feature distribution statistics for each feature self._populate_feature_stats() - def _populate_feature_stats(self): + def _populate_feature_stats(self, test_mat): """Setter function for feature distribution statistics. Pre-calculates the feature distribution information from the test matrix, including type (continuous or binary), mean, median, 5th & 95th percentiles, standard deviation. """ + # TODO: Modified to take in a test matrix, I think the function name should change + fstats = pd.DataFrame(columns=['feature', 'type', 'mean', 'stdev', 'median', 'p5', 'p95']) - dtypes = self.X_test.dtypes + dtypes = test_mat.dtypes + + # TODO: can vectorize? for col in self.column_names: - feat = self.X_test[col] + feat = test_mat[col] d = {'feature': col, 'mean': feat.mean(), 'median': feat.median(), @@ -363,7 +369,8 @@ def _populate_feature_stats(self): fstats = fstats.append(d, ignore_index=True) fstats.set_index('feature', inplace=True) - self.feature_stats = fstats + # self.feature_stats = fstats + return fstats def set_name_patterns(self, name_patterns): """Map regex patterns to column names. @@ -455,7 +462,7 @@ def _build_contrib_df_sample(self, mean_by_trees_list, how): return contrib_df - def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='features'): + def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, feature_stats, how='features'): """ Build contribution dataframe. @@ -479,7 +486,7 @@ def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='feat contrib_df = contrib_df.join(self.column_patterns, how='inner') contrib_df = contrib_df.groupby(['name_pattern'])['contribution'].sum().to_frame() else: - contrib_df = contrib_df.join(self.feature_stats, how='left') + contrib_df = contrib_df.join(feature_stats, how='left') # lookup the specific example's values for i, col in enumerate(contrib_df.index.values): @@ -490,7 +497,7 @@ def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, how='feat example_value = sample[i] else: if idx is not None: - example_value = self.X_test.loc[idx, col] + example_value = test_mat.loc[idx, col] else: example_value = sample[i] From cc385d77a200dcf3c3f844ca38cd4ff143d4cd6a Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 19:39:52 +0000 Subject: [PATCH 46/80] tested the new contrib_df --- tests/test_new_lorax.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index b645a76..00ca351 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -100,7 +100,10 @@ def test_feature_importances(self): print('Asserting the descriptive feature contributions...') - self.assertEqual(lrx_out, lrx_out2) + # self.assertEqual(lrx_out, lrx_out2) + + print(lrx_out.head()) + print(lrx_out2.head()) def test_scores_lorax(): """New test to check the importance scores and their signs (+) or (-)""" From 5b1c0c786fc4c302cef60d8fdc5a69ac66c77223 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 20:03:04 +0000 Subject: [PATCH 47/80] testing the new data loader --- tests/test_new_lorax.py | 58 +++++++---------------------------------- 1 file changed, 9 insertions(+), 49 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 00ca351..cdb9ac9 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -4,6 +4,7 @@ sys.path.append(project_path) import pandas as pd +# from pandas.testing import assert_frame_equal import numpy as np import random from datetime import datetime @@ -74,16 +75,6 @@ def test_feature_importances(self): graph=False ) - lrx_out2 = lrx.explain_example_new( - sample=None, - test_mat=data, - descriptive=True, - idx=0, - pred_class=pred_class, - num_features=10, - graph=False - ) - feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] @@ -98,50 +89,19 @@ def test_feature_importances(self): self.assertFalse('feature3' in lrx_out.contribution) - print('Asserting the descriptive feature contributions...') - - # self.assertEqual(lrx_out, lrx_out2) - - print(lrx_out.head()) - print(lrx_out2.head()) - -def test_scores_lorax(): - """New test to check the importance scores and their signs (+) or (-)""" - lrx = TheLorax( - global_clf, + def test_data_loader(self): + """Testing the data loader""" + lrx = TheLorax( + clf=global_clf, column_names=data.columns.values, + test_mat=data, id_col=None, date_col=None, - outcome_col=None) - - # without id_col (zero indexed) - # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) - - sample = data.loc[0].values - lrx_out = lrx.explain_example_new( - sample=sample, - test_mat=None, - descriptive=False, - idx=0, - pred_class=0, - num_features=10, - graph=False) - - lrx.load_dataset(test_mat=data) - - print(self.X_test.head()) - - # print(lrx_out) - - - # feature1_contrib = lrx_out.contribution.loc['feature1'] - # feature5_contrib = lrx_out.contribution.loc['feature5'] - - # print(feature1_contrib, feature5_contrib) + outcome_col=None + ) + pd.testing.assert_frame_equal(data, lrx.X_test) if __name__ == '__main__': - # test_lorax_breast_cancer() unittest.main() - # test_scores_lorax() \ No newline at end of file From 85ceeaf66a66f100162372db8bc067b1a33808de Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 20:53:41 +0000 Subject: [PATCH 48/80] made the feature stats public --- lorax/lorax.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 6bfb60c..cbde6d0 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -133,7 +133,7 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] # pre-calculating the feature distributions - self.feature_stats = self._populate_feature_stats(test_mat=self.X_test) + self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) def explain_example_new(self, sample=None, pred_class=None, @@ -231,7 +231,7 @@ def explain_example_new(self, sample=None, test_mat = self.X_test fstats = self.feature_stats else: - fstats = self._populate_feature_stats(test_mat) + fstats = self.populate_feature_stats(test_mat) contrib_df = self._build_contrib_df( contrib_list, @@ -332,9 +332,10 @@ def old_init(self, clf, test_mat, id_col=None, self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] # pre-calcuate feature distribution statistics for each feature - self._populate_feature_stats() + self.feature_stats = self.populate_feature_stats() - def _populate_feature_stats(self, test_mat): + # TODO: make protected again. Making public for testing + def populate_feature_stats(self, test_mat): """Setter function for feature distribution statistics. Pre-calculates the feature distribution information from the test matrix, including From 7d5c6d52027463ac68d4a162366ea1656ed6f567 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 20:54:00 +0000 Subject: [PATCH 49/80] tested feature stats and added next test descs --- tests/test_new_lorax.py | 56 ++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index cdb9ac9..8e20011 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -34,10 +34,12 @@ columns = ["entity_id", "as_of_date", "feature1", "feature2", "feature3", "feature4", "feature5", "outcome"] +features = [x for x in columns if x not in ['entity_id', 'as_of_date', 'outcome']] + data = pd.DataFrame(data, columns=columns) # Testing the independence from id_col, date_col, outcome -data = data.drop(['entity_id', 'as_of_date', 'outcome'], axis=1) +# data = data.drop(['entity_id', 'as_of_date', 'outcome'], axis=1) n_estimators = 2 @@ -54,20 +56,21 @@ def test_feature_importances(self): # Setting up lorax lrx = TheLorax( clf=global_clf, - column_names=data.columns.values, - test_mat=None, - id_col=None, - date_col=None, - outcome_col=None) + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') # without id_col (zero indexed) # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) - sample = data.loc[0].values + sample = data.loc[0, features].values + pred_class = 0 # The label w.r.t the explanations are generated lrx_out = lrx.explain_example_new( sample=sample, - test_mat=data, + test_mat=None, descriptive=True, idx=None, pred_class=pred_class, @@ -89,19 +92,44 @@ def test_feature_importances(self): self.assertFalse('feature3' in lrx_out.contribution) - def test_data_loader(self): + def test_feature_stats(self): """Testing the data loader""" + lrx = TheLorax( clf=global_clf, - column_names=data.columns.values, + column_names=features, test_mat=data, - id_col=None, - date_col=None, - outcome_col=None + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome' ) - pd.testing.assert_frame_equal(data, lrx.X_test) + st1 = lrx.populate_feature_stats(data[features]) + + pd.testing.assert_frame_equal(st1, lrx.feature_stats) + + def test_descriptive_explanation_cases(self): + """ + There are different methods to get a descriptive explanation + This test asserts all those methods yield the same answer + """ + pass + + def test_old_vs_new_lorax(self): + """ + Verifying that the new explain method is + generating the same explanations as before + """ + pass + def test_explanation_patterns(self): + """ + Testing whether the explanations interms of + feature patterns are generated correctly + """ + pass if __name__ == '__main__': + print(data.columns.values) unittest.main() + From b41fdca1159f63831afb3dfad8e535a3d67285e0 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 20 Feb 2020 20:55:39 +0000 Subject: [PATCH 50/80] fixed error --- lorax/lorax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index cbde6d0..0c5dc16 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -332,7 +332,7 @@ def old_init(self, clf, test_mat, id_col=None, self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] # pre-calcuate feature distribution statistics for each feature - self.feature_stats = self.populate_feature_stats() + self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) # TODO: make protected again. Making public for testing def populate_feature_stats(self, test_mat): From 58d03b2e63b6d03610b7f43072b5002fa227133c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 21:22:09 +0000 Subject: [PATCH 51/80] moved the doc strig --- lorax/lorax.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/lorax/lorax.py b/lorax/lorax.py index 0c5dc16..af7bb3c 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -143,6 +143,49 @@ def explain_example_new(self, sample=None, test_mat=None, idx=None, graph=False): + # TODO: Adapt the docstring to tne new function + """ + Graph or return individual feature importances for an example. + + This method is the primary interface for TheLorax to calculate individual feature + importances for a given example (identified by `idx`). It can be used to either + return a pandas DataFrame with contributions and feature distributions (if + `graph=False`) or a graphical representation of the top `num_features` contributions + (if `graph=True`, the default) for use in a jupyter notebook. + + Feature contributions can be calucalted either for all features separately (`how='features', + the default) or using regular expression patterns to group sets of features together + (`how='patterns'`). When graphing contributions for all features, graphs will contain two + components: + 1. A bar graph of the top num_features contributions to the example's score + 2. For each of these features, a graph showing the percentile for the feature's mean + across the entire test set (gray dot), the percentile of the feature value for the + example being explained (orange dot) and the z-score for that value + When using regular expression patterns, the feature distribution information is omitted + (from both graphical and dataframe outputs) as the contributions reflect aggregations over + an arbitrary number and types of features. + + Arguments: + idx (int) The entity id of the example we want to explain + pred_class (int) The predicted class for the example (currently must be 1 or 0). The + returned feature contributions will be taken relative to the score for this class. + If None (the default), the predicted class will be assigned based on whether the + example's score is above or below a threshold of 0.5. + num_features (int) The number of features with the highest contributions to graph + (ignored if `graph=False` in which case the entire set will be returned) + graph (bool) Whether to graph the feature contributions or return a dataframe + without graphing (default: True) + how (str) Whether to calculate feature contributions at the level of individual features + (`how='features'`, the default) or using regex patterns (`how='patterns'`). + If using regex patterns, `name_patterns` must have been provided when the object + was constructed or through calling `set_name_patterns()`. + + Returns: + If `graph=False`, returns a pandas dataframe with individual feature contributions + and (if using `how='features'`) feature distribution information + + """ + # User has to pass either an index and a test_mat or a samples (a row) if sample is None and (test_mat is None or idx is None): raise ValueError('Must either provide a data sample or a test matrix with a sample index') @@ -633,6 +676,8 @@ def _plot_dists(self, df, num_features, ax): ax.set_title('Feature Distributions', fontsize=16) def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how='features'): + # TODO: This method is now deprecated. So, combine this with the explain_example_new function + # and rename it to explain_example """Graph or return individual feature importances for an example. This method is the primary interface for TheLorax to calculate individual feature From 44a090e2e689ed80c8a3128532d2d1b03c76960f Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 21:34:50 +0000 Subject: [PATCH 52/80] Kits TODO from the old method --- lorax/lorax.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index af7bb3c..0be24c4 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -143,7 +143,7 @@ def explain_example_new(self, sample=None, test_mat=None, idx=None, graph=False): - # TODO: Adapt the docstring to tne new function + # TODO: Adapt the docstring to the new function """ Graph or return individual feature importances for an example. @@ -185,6 +185,10 @@ def explain_example_new(self, sample=None, and (if using `how='features'`) feature distribution information """ + # TODO: Categoricals can be handled using regex patterns, but this currently precludes + # showing feature distribution information (since we don't know how to combine distributions + # for arbitary feature groupings), but if just using patterns for categoricals/imputed flags + # we should still be able to show relevant distribution info... # User has to pass either an index and a test_mat or a samples (a row) if sample is None and (test_mat is None or idx is None): From 3db1d2a3a275b5814bbc36eaa82642f86d7fc608 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 21:54:13 +0000 Subject: [PATCH 53/80] removed the num_features check --- lorax/lorax.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 0be24c4..defbccc 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -202,6 +202,7 @@ def explain_example_new(self, sample=None, raise ValueError('Must specify name patterns to aggregate over.' + 'Use TheLorax.set_name_patterns() first.') elif how not in ['features', 'patterns']: + # NOTE-KA: Minor, in this case, should we default to features and let the code run with a warning? raise ValueError('How must be one of features or patterns.') # TODO: Add error handling for sample's features and the data features. @@ -310,12 +311,7 @@ def explain_example_new(self, sample=None, ).set_index('feature') ) - if graph: - # NOTE-KA: num features seem to be a unnecessary dependency. - # Defaulting to 10 in the function signature is less general. - # For now adding a error check, if passed value > # features in the df, will plot all values - # But, I feel like it should be handled more elegantly - num_features = min(num_features, contrib_df.shape[0]) + if graph: self._plot_graph(idx, pred_class, score, num_features, contrib_df, how) From 7569b424877857b7ee58316bcff23abe909ca242 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 21:58:00 +0000 Subject: [PATCH 54/80] removed the old explain instance method and renamed the new_method --- lorax/lorax.py | 135 +++---------------------------------------------- 1 file changed, 7 insertions(+), 128 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index defbccc..54a67ac 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -135,13 +135,13 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco # pre-calculating the feature distributions self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) - def explain_example_new(self, sample=None, - pred_class=None, - num_features=10, - how='features', - descriptive=False, - test_mat=None, - idx=None, graph=False): + def explain_example(self, sample=None, + pred_class=None, + num_features=10, + how='features', + descriptive=False, + test_mat=None, + idx=None, graph=False): # TODO: Adapt the docstring to the new function """ @@ -675,127 +675,6 @@ def _plot_dists(self, df, num_features, ax): ax.set_facecolor('white') ax.set_title('Feature Distributions', fontsize=16) - def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how='features'): - # TODO: This method is now deprecated. So, combine this with the explain_example_new function - # and rename it to explain_example - """Graph or return individual feature importances for an example. - - This method is the primary interface for TheLorax to calculate individual feature - importances for a given example (identified by `idx`). It can be used to either - return a pandas DataFrame with contributions and feature distributions (if - `graph=False`) or a graphical representation of the top `num_features` contributions - (if `graph=True`, the default) for use in a jupyter notebook. - - Feature contributions can be calucalted either for all features separately (`how='features', - the default) or using regular expression patterns to group sets of features together - (`how='patterns'`). When graphing contributions for all features, graphs will contain two - components: - 1. A bar graph of the top num_features contributions to the example's score - 2. For each of these features, a graph showing the percentile for the feature's mean - across the entire test set (gray dot), the percentile of the feature value for the - example being explained (orange dot) and the z-score for that value - When using regular expression patterns, the feature distribution information is omitted - (from both graphical and dataframe outputs) as the contributions reflect aggregations over - an arbitrary number and types of features. - - Arguments: - idx (int) The entity id of the example we want to explain - pred_class (int) The predicted class for the example (currently must be 1 or 0). The - returned feature contributions will be taken relative to the score for this class. - If None (the default), the predicted class will be assigned based on whether the - example's score is above or below a threshold of 0.5. - num_features (int) The number of features with the highest contributions to graph - (ignored if `graph=False` in which case the entire set will be returned) - graph (bool) Whether to graph the feature contributions or return a dataframe - without graphing (default: True) - how (str) Whether to calculate feature contributions at the level of individual features - (`how='features'`, the default) or using regex patterns (`how='patterns'`). - If using regex patterns, `name_patterns` must have been provided when the object - was constructed or through calling `set_name_patterns()`. - - Returns: - If `graph=False`, returns a pandas dataframe with individual feature contributions - and (if using `how='features'`) feature distribution information - - """ - # TODO: Categoricals can be handled using regex patterns, but this currently precludes - # showing feature distribution information (since we don't know how to combine distributions - # for arbitary feature groupings), but if just using patterns for categoricals/imputed flags - # we should still be able to show relevant distribution info... - - if how == 'patterns' and self.column_patterns is None: - raise ValueError('Must specify name patterns to aggregate over.' + - 'Use TheLorax.set_name_patterns() first.') - elif how not in ['features', 'patterns']: - raise ValueError('How must be one of features or patterns.') - - # If we have MultiIndex, we need to sort - if self.combined_index: - self.preds.sort_index(level=0, inplace=True) - self.X_test.sort_index(level=0, inplace=True) - - # score for this example for the positive class - # using threshold of 0.5 if pred_class is not given as an argument - score = self.preds.loc[idx, 'pred'] - if pred_class is None: - pred_class = int(score >= 0.5) - - # feature values for this example - sample = self.X_test.loc[idx, ].values - if self.combined_index: - sample = sample[0] - - if isinstance(self.clf, RandomForestClassifier): - # Getting values for Random Forest Classifier - return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) - - self.num_trees = return_tuple[0] - self.global_score_dict = return_tuple[1] - self.feature_dict = return_tuple[2] - self.aggregated_dict = return_tuple[3] - contrib_list = return_tuple[4] - - elif isinstance(self.clf, LogisticRegression): - # Getting values for Random Forest Classifier - contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) - - # TODO: handle this more elegantly for multiclass problems - # We need to flip the sign of the scores. - if pred_class == 0: - score = 1.0 - score - contrib_list = [(feature, score * -1) for feature, score in contrib_list] - - logging.info('Used predicted class {} for example {}, score={}'.format(pred_class, - idx, - score)) - - # sorting in descending order by contribution then by feature name in the case of ties - contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) - - # drop the results into a dataframe to append on other information - contrib_df = self._build_contrib_df(contrib_list, idx, how) - - # adding overall feature importance from model level - overall_importance = [] - for i in range(len(self.column_names)): - if isinstance(self.clf, LogisticRegression): - overall_importance.append((self.column_names[i], self.clf.coef_[0][i])) - else: - overall_importance.append((self.column_names[i], self.clf.feature_importances_[i])) - - updated_list = add_overall_feature_importance(contrib_list, - overall_importance) - updated_columns = ['feature', 'sample_rank', 'overall_imp', 'overall_rank', 'rank_change'] - - contrib_df = contrib_df.join(pd.DataFrame(data=updated_list, - columns=updated_columns).set_index('feature')) - - if graph: - self._plot_graph(idx, pred_class, score, - num_features, contrib_df, how) - else: - return contrib_df - def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, how='features'): """Explain an example's score. From 80e6c91723900ae17a744af760eec9fcb4284650 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 23:16:28 +0000 Subject: [PATCH 55/80] fixed the new explanation method to consider the pre-loaded dataset --- lorax/lorax.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 0c5dc16..d6495df 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -144,7 +144,7 @@ def explain_example_new(self, sample=None, idx=None, graph=False): # User has to pass either an index and a test_mat or a samples (a row) - if sample is None and (test_mat is None or idx is None): + if sample is None and ((test_mat is None and self.X_test is None) or idx is None): raise ValueError('Must either provide a data sample or a test matrix with a sample index') # A test matrix is necessary for getting descriptive stats @@ -161,6 +161,9 @@ def explain_example_new(self, sample=None, if isinstance(sample, pd.Series): sample = sample.values + if self.X_test is not None and idx is not None: + sample = self.X_test.loc[idx].values + # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns if test_mat is not None: # Indexing and exclusing non-feature columns @@ -728,7 +731,9 @@ def explain_example(self, idx, pred_class=None, num_features=10, graph=True, how contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) # drop the results into a dataframe to append on other information - contrib_df = self._build_contrib_df(contrib_list, idx, how) + contrib_df = self._build_contrib_df(contrib_list, + test_mat=self.X_test, + idx=idx, sample=sample, feature_stats=self.feature_stats, how=how) # adding overall feature importance from model level overall_importance = [] From 4e5f47161761095b1363fa88d89722950deaa4f6 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 23:18:14 +0000 Subject: [PATCH 56/80] reordered the arguments to reflect the correlations --- lorax/lorax.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index d6495df..ead242d 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -137,11 +137,13 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco def explain_example_new(self, sample=None, pred_class=None, - num_features=10, - how='features', - descriptive=False, test_mat=None, - idx=None, graph=False): + idx=None, + descriptive=False, + graph=False, + num_features=10, + how='features' + ): # User has to pass either an index and a test_mat or a samples (a row) if sample is None and ((test_mat is None and self.X_test is None) or idx is None): From efd15e776c8bc47282a13b3123ebb7f29cf59d7d Mon Sep 17 00:00:00 2001 From: Kasun A Date: Mon, 24 Feb 2020 23:24:24 +0000 Subject: [PATCH 57/80] tested the outputs from new and old --- tests/test_new_lorax.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 8e20011..6df122a 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -120,7 +120,37 @@ def test_old_vs_new_lorax(self): Verifying that the new explain method is generating the same explanations as before """ - pass + lrx = TheLorax( + clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome' + ) + + pred_class = 0 # The label w.r.t the explanations are generated + idx = 2 + lrx_out_new = lrx.explain_example_new( + sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False + ) + + lrx_out_old = lrx.explain_example( + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='features' + ) + + pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) + def test_explanation_patterns(self): """ From 19806a74d00c525edb6be6bf4b13d2a3ce0e7723 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 25 Feb 2020 19:49:31 +0000 Subject: [PATCH 58/80] mergerd the old and the new functions --- tests/test_new_lorax.py | 65 ++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 6df122a..c95946f 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -119,37 +119,42 @@ def test_old_vs_new_lorax(self): """ Verifying that the new explain method is generating the same explanations as before - """ - lrx = TheLorax( - clf=global_clf, - column_names=features, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome' - ) - - pred_class = 0 # The label w.r.t the explanations are generated - idx = 2 - lrx_out_new = lrx.explain_example_new( - sample=None, - test_mat=None, - descriptive=True, - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False - ) - lrx_out_old = lrx.explain_example( - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False, - how='features' - ) - - pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) + Note: This test was deprecated after verufying that the new explain instance + returned the same results as the old one. + The old method was emoved from the class + """ + pass + # lrx = TheLorax( + # clf=global_clf, + # column_names=features, + # test_mat=data, + # id_col='entity_id', + # date_col='as_of_date', + # outcome_col='outcome' + # ) + + # pred_class = 0 # The label w.r.t the explanations are generated + # idx = 2 + # lrx_out_new = lrx.explain_example( + # sample=None, + # test_mat=None, + # descriptive=True, + # idx=idx, + # pred_class=pred_class, + # num_features=10, + # graph=False + # ) + + # lrx_out_old = lrx.explain_example( + # idx=idx, + # pred_class=pred_class, + # num_features=10, + # graph=False, + # how='features' + # ) + + # pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) def test_explanation_patterns(self): From 6ca04464fdc1047afdb0cb9acd3c04f9826df70d Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 25 Feb 2020 21:02:18 +0000 Subject: [PATCH 59/80] fixed error and created a bogus categorical column --- tests/test_new_lorax.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index c95946f..39c9385 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -68,7 +68,7 @@ def test_feature_importances(self): sample = data.loc[0, features].values pred_class = 0 # The label w.r.t the explanations are generated - lrx_out = lrx.explain_example_new( + lrx_out = lrx.explain_example( sample=sample, test_mat=None, descriptive=True, @@ -77,7 +77,7 @@ def test_feature_importances(self): num_features=10, graph=False ) - + feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] @@ -162,9 +162,34 @@ def test_explanation_patterns(self): Testing whether the explanations interms of feature patterns are generated correctly """ - pass + + # Creating the data with cateorical columns to have regex feature patterns + X2, y2 = datasets.make_classification(n_samples=10000, n_features=6, + n_informative=3, n_redundant=2, + random_state=42) + + data2 = np.append(X2, y2.reshape(y2.shape[0], 1), axis=1) + columns2 = ["feature1", "feature2", + "feature3", "feature4", "feature5", + "category", "outcome"] + data2 = pd.DataFrame(data2, columns=columns2) + + # Creating the categorical features + data2['category']= pd.cut( + data2['category'], + bins=2, + labels=['a','b'] + ) + + data2 = pd.get_dummies(data2, columns=['category']) + + features2 = [x for x in data2.columns.values if x not in ['entity_id', 'as_of_date', 'outcome']] + + + + if __name__ == '__main__': - print(data.columns.values) + print(features) unittest.main() From da6e4d441a7f72d79c4b1a940312942eb32e55fe Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 25 Feb 2020 22:31:38 +0000 Subject: [PATCH 60/80] added the set name patterns --- lorax/lorax.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 9727f82..1f1cb74 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -57,6 +57,10 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co self.column_names = column_names self.column_patterns = column_patterns + # Register the regex patterns and associated columns if using + if column_patterns is not None: + self.set_name_patterns(column_patterns) + # NOTE-KA: I feel like the method should be independent of these as these seem very triage specific. # We can always have a script that bridges the triage data with the explain API self.id_col = id_col @@ -70,7 +74,7 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co # if outcome_col is not None: # self.drop_cols.append(outcome_col) - + self.combined_index = False if id_col is not None: if type(id_col) in [list, tuple]: From d3edece7f71abc325f01ac0291f07595cbd8c021 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 25 Feb 2020 22:32:28 +0000 Subject: [PATCH 61/80] added method to test pattern features --- tests/test_new_lorax.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 39c9385..0c3f847 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -185,11 +185,40 @@ def test_explanation_patterns(self): features2 = [x for x in data2.columns.values if x not in ['entity_id', 'as_of_date', 'outcome']] + n_estimators = 10 + max_depth = 4 + clf = RandomForestClassifier( + n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(data2[features2].values, y2) + + lrx = TheLorax( + clf=clf, + column_names=features2, + column_patterns=['feature', 'category_'], + test_mat=data2, + id_col=None, + date_col=None, + outcome_col='outcome' + ) + + sample = data2.loc[0, features2].values + pred_class = 0 # The label w.r.t the explanations are generate + + lrx_out = lrx.explain_example( + sample=sample, + test_mat=None, + descriptive=True, + idx=None, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns' + ) - + print(lrx_out) if __name__ == '__main__': - print(features) unittest.main() From 9cbdb583023090818192c05ba220813328595efd Mon Sep 17 00:00:00 2001 From: Kasun A Date: Tue, 25 Feb 2020 22:41:24 +0000 Subject: [PATCH 62/80] added the old explain method --- lorax/lorax.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/lorax/lorax.py b/lorax/lorax.py index 1f1cb74..07ba710 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -682,6 +682,81 @@ def _plot_dists(self, df, num_features, ax): ax.set_facecolor('white') ax.set_title('Feature Distributions', fontsize=16) + def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, how='features'): + if how == 'patterns' and self.column_patterns is None: + raise ValueError('Must specify name patterns to aggregate over.' + + 'Use TheLorax.set_name_patterns() first.') + elif how not in ['features', 'patterns']: + raise ValueError('How must be one of features or patterns.') + + # If we have MultiIndex, we need to sort + if self.combined_index: + self.preds.sort_index(level=0, inplace=True) + self.X_test.sort_index(level=0, inplace=True) + + # score for this example for the positive class + # using threshold of 0.5 if pred_class is not given as an argument + score = self.preds.loc[idx, 'pred'] + if pred_class is None: + pred_class = int(score >= 0.5) + + # feature values for this example + sample = self.X_test.loc[idx, ].values + if self.combined_index: + sample = sample[0] + + if isinstance(self.clf, RandomForestClassifier): + # Getting values for Random Forest Classifier + return_tuple = get_contrib_list_RF(self.clf, sample, self.column_names) + + self.num_trees = return_tuple[0] + self.global_score_dict = return_tuple[1] + self.feature_dict = return_tuple[2] + self.aggregated_dict = return_tuple[3] + contrib_list = return_tuple[4] + + elif isinstance(self.clf, LogisticRegression): + # Getting values for Random Forest Classifier + contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) + + # TODO: handle this more elegantly for multiclass problems + # We need to flip the sign of the scores. + if pred_class == 0: + score = 1.0 - score + contrib_list = [(feature, score * -1) for feature, score in contrib_list] + + logging.info('Used predicted class {} for example {}, score={}'.format(pred_class, + idx, + score)) + + # sorting in descending order by contribution then by feature name in the case of ties + contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) + + # drop the results into a dataframe to append on other information + contrib_df = self._build_contrib_df(contrib_list, idx, how) + + # adding overall feature importance from model level + overall_importance = [] + for i in range(len(self.column_names)): + if isinstance(self.clf, LogisticRegression): + overall_importance.append((self.column_names[i], self.clf.coef_[0][i])) + else: + overall_importance.append((self.column_names[i], self.clf.feature_importances_[i])) + + updated_list = add_overall_feature_importance(contrib_list, + overall_importance) + updated_columns = ['feature', 'sample_rank', 'overall_imp', 'overall_rank', 'rank_change'] + + contrib_df = contrib_df.join(pd.DataFrame(data=updated_list, + columns=updated_columns).set_index('feature')) + + if graph: + self._plot_graph(idx, pred_class, score, + num_features, contrib_df, how) + else: + return contrib_df + + def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, how='features'): """Explain an example's score. From 38d06b6573a20d80cd6e2758cee526b36fc5a092 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 15:13:28 +0000 Subject: [PATCH 63/80] adapted the old explaner contrib_df --- lorax/lorax.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 07ba710..e7a1e2f 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -733,7 +733,12 @@ def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, contrib_list.sort(key=lambda x: (x[1] * -1, x[0])) # drop the results into a dataframe to append on other information - contrib_df = self._build_contrib_df(contrib_list, idx, how) + contrib_df = self._build_contrib_df( + contrib_list, test_mat=self.X_test, + sample=sample, + feature_stats=self.feature_stats, + idx=idx, how=how + ) # adding overall feature importance from model level overall_importance = [] From 89535b9fb4766fc49e74273754b7f2bdf195b80f Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 16:16:36 +0000 Subject: [PATCH 64/80] tests to compare old and new --- tests/test_new_lorax.py | 82 ++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 0c3f847..97b5bfc 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -125,36 +125,38 @@ def test_old_vs_new_lorax(self): The old method was emoved from the class """ pass - # lrx = TheLorax( - # clf=global_clf, - # column_names=features, - # test_mat=data, - # id_col='entity_id', - # date_col='as_of_date', - # outcome_col='outcome' - # ) - - # pred_class = 0 # The label w.r.t the explanations are generated - # idx = 2 - # lrx_out_new = lrx.explain_example( - # sample=None, - # test_mat=None, - # descriptive=True, - # idx=idx, - # pred_class=pred_class, - # num_features=10, - # graph=False - # ) - - # lrx_out_old = lrx.explain_example( - # idx=idx, - # pred_class=pred_class, - # num_features=10, - # graph=False, - # how='features' - # ) - - # pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) + lrx = TheLorax( + clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome' + ) + + pred_class = 0 # The label w.r.t the explanations are generated + idx = 2 + lrx_out_new = lrx.explain_example( + sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False + ) + + lrx_out_old = lrx.explain_example_old( + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='features' + ) + + pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) + + print() def test_explanation_patterns(self): @@ -195,28 +197,40 @@ def test_explanation_patterns(self): lrx = TheLorax( clf=clf, column_names=features2, - column_patterns=['feature', 'category_'], + column_patterns=['feature'], test_mat=data2, id_col=None, date_col=None, outcome_col='outcome' ) + idx = 0 sample = data2.loc[0, features2].values pred_class = 0 # The label w.r.t the explanations are generate lrx_out = lrx.explain_example( - sample=sample, + sample=None, test_mat=None, descriptive=True, - idx=None, + idx=idx, pred_class=pred_class, num_features=10, graph=False, how='patterns' ) + + lrx_out_old = lrx.explain_example_old( + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns' + ) - print(lrx_out) + print(lrx_out) + print(lrx_out_old) + + pd.testing.assert_frame_equal(lrx_out, lrx_out_old) if __name__ == '__main__': From ba251c9ca1b656a5bfda052d578e3352e0ca9ef5 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 16:51:57 +0000 Subject: [PATCH 65/80] docstring --- lorax/lorax.py | 72 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index e7a1e2f..64a165c 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -97,14 +97,16 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outcome_col=None): """ Loading a test dataset to the object. This dataset can be used to provide context to the individual predition explanations. - Context entails observing feature distributions + Context, in this case, are the feature distributions - param test_mat: A pandas dataframe - param id_col: The name(s) of the columns to uniquely identify an instance (entity_id in triage land) - param date_col: - param outcome_col: + Args: + test_mat: A pandas dataframe containing the dataset + id_col: The name(s) of the columns to uniquely identify an instance (entity_id in triage land) + date_col: Name of the column that has the date information + outcome_col: If the dataframe contains the target output, the column name of the target - return: None + return: + None """ df = test_mat.copy() @@ -139,25 +141,25 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco # pre-calculating the feature distributions self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) - def explain_example(self, sample=None, + def explain_example(self, + sample=None, + descriptive=False, + test_mat=None, + idx=None, pred_class=None, + graph=False, num_features=10, - how='features', - descriptive=False, - test_mat=None, - idx=None, graph=False): - - # TODO: Adapt the docstring to the new function + how='features'): """ Graph or return individual feature importances for an example. This method is the primary interface for TheLorax to calculate individual feature - importances for a given example (identified by `idx`). It can be used to either + importances for a given example. The function can be used to either return a pandas DataFrame with contributions and feature distributions (if `graph=False`) or a graphical representation of the top `num_features` contributions - (if `graph=True`, the default) for use in a jupyter notebook. + (if `graph=True`) for use in a jupyter notebook. - Feature contributions can be calucalted either for all features separately (`how='features', + Feature contributions can be calucalted either for all features separately (`how='features'`, the default) or using regular expression patterns to group sets of features together (`how='patterns'`). When graphing contributions for all features, graphs will contain two components: @@ -170,19 +172,33 @@ def explain_example(self, sample=None, an arbitrary number and types of features. Arguments: - idx (int) The entity id of the example we want to explain - pred_class (int) The predicted class for the example (currently must be 1 or 0). The - returned feature contributions will be taken relative to the score for this class. - If None (the default), the predicted class will be assigned based on whether the - example's score is above or below a threshold of 0.5. - num_features (int) The number of features with the highest contributions to graph + sample (array): The instance that is to be explained. If this is None, a test matrix and a sample index should be provided + + descriptive (bool): Whether to accompany the explanations with feature distribution data of a test dataset. + Gives more context to the feature important scores. + To be used, a test dataset should be availabe to the function through `test_mat` or `load_dataset()`. + If not set, only the individual feature importance scores will be returned to the user. + + test_mat (pd.DataFrame): A test dataset to be used for descriptive explanations. If provided, this will override any dataset preloaded using `load_dataset()` + + idx (int): The index---w.r.t to the test dataset provided through `test_mat` or `load_dataset()`---of the example we want to explain. + If both, a sample and an index are provided, sample will be ignored + + pred_class (int): The predicted class for the example (currently must be 1 or 0). The + returned feature contributions will be taken relative to the score for this class. + If None (the default), the predicted class will be assigned based on whether the + example's score is above or below a threshold of 0.5. + + graph (bool): Whether to graph the feature contributions or return a dataframe + without graphing (default: False) + + num_features (int): The number of features with the highest contributions to graph (ignored if `graph=False` in which case the entire set will be returned) - graph (bool) Whether to graph the feature contributions or return a dataframe - without graphing (default: True) - how (str) Whether to calculate feature contributions at the level of individual features - (`how='features'`, the default) or using regex patterns (`how='patterns'`). - If using regex patterns, `name_patterns` must have been provided when the object - was constructed or through calling `set_name_patterns()`. + + how (str): Whether to calculate feature contributions at the level of individual features + (`how='features'`, the default) or using regex patterns (`how='patterns'`). + If using regex patterns, `name_patterns` must have been provided when the object + as constructed or through calling `set_name_patterns()`. Returns: If `graph=False`, returns a pandas dataframe with individual feature contributions From b4d18f4889c53ade850dd0c91610e2187929e197 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 20:17:21 +0000 Subject: [PATCH 66/80] docstring of the class --- lorax/lorax.py | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 64a165c..f1da588 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -34,22 +34,32 @@ class TheLorax(object): with other types of models and problems (regression/multinomial classification) Args: - clf (sklearn classifier) The classifier to be explained, e.g., + clf (sklearn classifier): The classifier to be explained, e.g., sklearn.ensemble.RandomForestClassifier - test_mat (pandas.DataFrame) The test matrix containing all examples to be - explained. If `id_col=None` (the default), the id for referencing entities - must be set as this dataframe's index. - id_col (str) The column name for the entity id in the test matrix. If `None` - (the default), the test matrix must be indexed by the entity id. - date_col (str) The date column in the matrix (default: `as_of_date`) - outcome_col (str) The outcome column in the matrix (default: `outcome`). To - indicate that the test matrix has no labels, set `outcome_col=None`. - name_patterns (list) An optional list of regex patterns or compiled regex + The classifier should be already trained. + + column_names (List(str)): The input feature names of the data for which the classifier was trained + + column_patterns (list): An optional list of regex patterns or compiled regex objects to group together features for reporting contributions. If using, each feature name in the test matrix must match one and only one pattern. + + test_mat (pandas.DataFrame): A test matrix to be pre-loaded to the object. This dataset will be used for providing feature distributions to the individual + If `id_col=None` (the default), the id for referencing entities + must be set as this dataframe's index. + + id_col (str): The column name for the entity id in the test matrix. If `None` + (the default), the test matrix must be indexed by the entity id. + + date_col (str): The date column in the matrix (default: None). + If None, the matrix shouldn't contain a column with date + + outcome_col (str): The outcome column in the matrix (default: None). + To indicate that the test matrix has no labels, set `outcome_col=None`. + """ - def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_col=None, date_col=None, outcome_col=None): + def __init__(self, clf, column_names, column_patterns=None, test_mat=None, id_col=None, date_col=None, outcome_col=None): self.clf = clf # NOTE: Minor. maybe this should be feature_names and feature_patterns @@ -63,18 +73,11 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co # NOTE-KA: I feel like the method should be independent of these as these seem very triage specific. # We can always have a script that bridges the triage data with the explain API + # Leaving this decoupling to another PR self.id_col = id_col self.date_col = date_col self.outcome_col = outcome_col - # TODO: Move this to the load dataset - # self.drop_cols = [] - # if date_col is not None and date_col not in id_col: - # self.drop_cols.append(date_col) - - # if outcome_col is not None: - # self.drop_cols.append(outcome_col) - self.combined_index = False if id_col is not None: if type(id_col) in [list, tuple]: @@ -96,8 +99,7 @@ def __init__(self, clf, column_names, test_mat=None, column_patterns=None, id_co def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outcome_col=None): """ Loading a test dataset to the object. - This dataset can be used to provide context to the individual predition explanations. - Context, in this case, are the feature distributions + This dataset can be used to suppliment individual feature importances with feature distribution stats Args: test_mat: A pandas dataframe containing the dataset @@ -171,7 +173,7 @@ def explain_example(self, (from both graphical and dataframe outputs) as the contributions reflect aggregations over an arbitrary number and types of features. - Arguments: + Args: sample (array): The instance that is to be explained. If this is None, a test matrix and a sample index should be provided descriptive (bool): Whether to accompany the explanations with feature distribution data of a test dataset. @@ -347,6 +349,7 @@ def explain_example(self, def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', name_patterns=None): + # TODO: This method should be removed after verifying that the new init compares """ Initialize Lorax. From 586a518536e825a509ec02e980e46ac05ea8dd1b Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 20:21:27 +0000 Subject: [PATCH 67/80] doc string --- lorax/lorax.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index f1da588..b06f8cc 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -405,10 +405,15 @@ def old_init(self, clf, test_mat, id_col=None, # TODO: make protected again. Making public for testing def populate_feature_stats(self, test_mat): - """Setter function for feature distribution statistics. - - Pre-calculates the feature distribution information from the test matrix, including + """ + Pre-calculates the feature distribution information from a test matrix, including type (continuous or binary), mean, median, 5th & 95th percentiles, standard deviation. + + Args: + test_mat (pandas.DataFrame): THe test matrix in a dataframe form + + return: + A dataframe indexed by features containing feature distribution information """ # TODO: Modified to take in a test matrix, I think the function name should change From d80652c04208753ed2340d671f6850482606937e Mon Sep 17 00:00:00 2001 From: Kasun A Date: Wed, 26 Feb 2020 20:22:49 +0000 Subject: [PATCH 68/80] comments --- tests/test_new_lorax.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 97b5bfc..19d00ff 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -206,7 +206,7 @@ def test_explanation_patterns(self): idx = 0 sample = data2.loc[0, features2].values - pred_class = 0 # The label w.r.t the explanations are generate + pred_class = 0 # The label w.r.t the explanations are generated lrx_out = lrx.explain_example( sample=None, @@ -226,10 +226,8 @@ def test_explanation_patterns(self): graph=False, how='patterns' ) - - print(lrx_out) - print(lrx_out_old) + # Assreting that both methods yield the same answer pd.testing.assert_frame_equal(lrx_out, lrx_out_old) From 3218798e264be4d34119015a00c44abb068480bf Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 18:39:17 +0000 Subject: [PATCH 69/80] error handling for date-col and column names --- lorax/lorax.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index b06f8cc..37c540f 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -120,7 +120,7 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco self.combined_index = True # exclude non-feature columns (date, outcome if present) - if date_col not in id_col: + if (date_col is not None) and (date_col not in id_col): self.drop_cols.append(date_col) if outcome_col is not None: @@ -140,6 +140,9 @@ def load_dataset(self, test_mat: pd.DataFrame, id_col=None, date_col=None, outco if hasattr(self.clf, 'intercept_'): self.X_test["Intercept"] = [self.clf.intercept_[0] for i in range(len(self.X_test))] + # Making sure the column names match the test matrices columns + self.column_names = self.X_test.columns.values + # pre-calculating the feature distributions self.feature_stats = self.populate_feature_stats(test_mat=self.X_test) From 87cd208fd96ddcb031c506a9e9f93728bfb956da Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 18:48:46 +0000 Subject: [PATCH 70/80] updated the feature contribution and aggregated_dict tests to the new definitions --- tests/test_lorax.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index 85eca86..d6ee3c2 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -1,4 +1,9 @@ """Tests for Lorax.""" +# TODO: Figure out how to do this optimally +import os +import sys +project_path = os.path.join(os.path.dirname(__file__), '../') +sys.path.append(project_path) import random import unittest @@ -10,7 +15,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression -from lorax import TheLorax +from lorax.lorax import TheLorax from lorax.utils import add_overall_feature_importance random.seed(42) @@ -48,7 +53,11 @@ class TestLorax(unittest.TestCase): def test_calculated_feature_importances(self): """Test calculated feature importances.""" # Setting up lorax - lrx = TheLorax(global_clf, data, id_col='entity_id') + lrx = TheLorax(clf=global_clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) feature1_contrib = lrx_out.contribution.loc['feature1'] @@ -60,7 +69,11 @@ def test_calculated_feature_importances(self): self.assertFalse('feature3' in lrx_out.contribution) def test_aggregated_dict(self): - """Test aggregated_dict.""" + """ + Test aggregated_dict. In the modified version the + aggregated dict is an element of a dictionary named model_info + + """ n_estimators = 5 max_depth = 1 @@ -71,14 +84,19 @@ def test_aggregated_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') _ = lrx.explain_example(idx=1, pred_class=1, graph=False) # Max depth is 1. Number of split_occurences must be equal to # occurences_in_n_trees. - for feature in lrx.aggregated_dict: - split_occ = lrx.aggregated_dict[feature]['diff_list']['split_occurences'] - occ_trees = lrx.aggregated_dict[feature]['mean_diff_list']['occurences_in_n_trees'] + aggregated_dict = lrx.model_info['aggregated_dict'] + for feature in aggregated_dict: + split_occ = aggregated_dict[feature]['diff_list']['split_occurences'] + occ_trees = aggregated_dict[feature]['mean_diff_list']['occurences_in_n_trees'] self.assertEqual(split_occ, occ_trees) def test_logistic_regression_importances(self): From 4cccbd40186bb387b65da0fb2b99c8d41edeb2c5 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 20:22:26 +0000 Subject: [PATCH 71/80] handled the intercept in the data sample --- lorax/lorax.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 37c540f..e088399 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -235,7 +235,15 @@ def explain_example(self, sample = sample.values if self.X_test is not None and idx is not None: - sample = self.X_test.loc[idx].values + sample = self.X_test.loc[idx] + + # When creating X_test in load data, + # if the classifier contains a Intercept, it is added as a feature. + # Have to remove it before the inference phase + if 'Intercept' in sample.index.values: + sample = sample.drop(['Intercept']) + + sample = sample.values # Formatting the test data matrix by setting appropriate index and removing non-feature coulmns if test_mat is not None: @@ -269,6 +277,7 @@ def explain_example(self, elif isinstance(self.clf, LogisticRegression): # Getting values for Random Forest Classifier # TODO: The column names need to be consolidated + contrib_list = get_contrib_list_LR(self.clf, sample, self.column_names) # Setting the prediction class From 117a261f83cd3062ede0d0df54c38f132d8f376d Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 20:41:12 +0000 Subject: [PATCH 72/80] adaptaed all tests for the new template --- tests/test_lorax.py | 57 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index d6ee3c2..2c62eeb 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -7,6 +7,7 @@ import random import unittest +from unittest import TestSuite import numpy as np import pandas as pd from datetime import datetime @@ -106,7 +107,12 @@ def test_logistic_regression_importances(self): clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) feature1_contrib = lrx_out.contribution.loc['feature1'] @@ -125,7 +131,10 @@ def test_logistic_regression_importances(self): self.assertEqual(lrx_pred, lr_pred) def test_size_global_dict(self): - """Test the size of the global dict.""" + """ + Test the size of the global dict. Part of the model_info dictionary + + """ n_estimators = 3 max_depth = 1 @@ -136,21 +145,27 @@ def test_size_global_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, data, id_col='entity_id') + lrx = TheLorax(clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + _ = lrx.explain_example(idx=1, pred_class=1, graph=False) # Checking if there as many entries, i.e., trees in # global_score_dict as number of estimators in forest - self.assertEqual(len(lrx.global_score_dict), n_estimators) + global_score_dict = lrx.model_info['global_score_dict'] + self.assertEqual(len(global_score_dict), n_estimators) # Checking if every dict entry, i.e., tree has max_depth keys # Since max_depth=1, every tree dict should have only one entry for i in range(n_estimators): - self.assertEqual(len(lrx.global_score_dict[i]), 1) + self.assertEqual(len(global_score_dict[i]), 1) # Checking if dicts for only feature in tree do not # have more than one entry - for tree_idx, feat_dict in lrx.global_score_dict.items(): + for tree_idx, feat_dict in global_score_dict.items(): self.assertEqual(len(feat_dict), 1) def test_add_overall_feature_importance(self): @@ -166,8 +181,18 @@ def test_add_overall_feature_importance(self): self.assertTupleEqual(true_result[i], result[i]) # Setting up lorax - lrx = TheLorax(global_clf, data, id_col='entity_id') - lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) + lrx = TheLorax(global_clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + lrx_out = lrx.explain_example( + idx=1, + pred_class=1, + graph=False, + descriptive=True + ) feature1_overall_imp = global_clf.feature_importances_[0] @@ -179,7 +204,12 @@ def test_multiple_rows_per_entity_id(self): """Test support of multiple rows per entity_id.""" # Setting up lorax # Getting output on test matrix with one row per entity_id - lrx = TheLorax(global_clf, data, id_col='entity_id') + lrx = TheLorax(global_clf, column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) # Changing test matrix so that the second row belongs @@ -189,7 +219,12 @@ def test_multiple_rows_per_entity_id(self): # Checking that the output for original row of entity 1 # remains the same when using combined index - lrx = TheLorax(global_clf, new_data, id_col=['entity_id', 'as_of_date']) + lrx = TheLorax(global_clf, column_names=data.columns.values, + test_mat=new_data, + id_col=['entity_id', 'as_of_date'], + date_col='as_of_date', + outcome_col='outcome') + out_multi_rows = lrx.explain_example(idx=(1, '2017-08-21 18:01:57.040781'), pred_class=1, graph=False) @@ -207,4 +242,4 @@ def test_multiple_rows_per_entity_id(self): if __name__ == "__main__": - unittest.main(exit=False) + unittest.main(exit=True) From 216af406c0b9b8ee9c3efdc1da074a133c0a7820 Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:05:29 -0500 Subject: [PATCH 73/80] PEP8 --- lorax/lorax.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index e088399..440e983 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -59,7 +59,12 @@ class TheLorax(object): """ - def __init__(self, clf, column_names, column_patterns=None, test_mat=None, id_col=None, date_col=None, outcome_col=None): + def __init__(self, clf, column_names, + column_patterns=None, + test_mat=None, + id_col=None, + date_col=None, outcome_col=None): + self.clf = clf # NOTE: Minor. maybe this should be feature_names and feature_patterns @@ -318,14 +323,12 @@ def explain_example(self, else: fstats = self.populate_feature_stats(test_mat) - contrib_df = self._build_contrib_df( - contrib_list, - test_mat=test_mat, - idx=idx, - sample=sample, - feature_stats=fstats, - how=how - ) + contrib_df = self._build_contrib_df(contrib_list, + test_mat=test_mat, + idx=idx, + sample=sample, + feature_stats=fstats, + how=how) # adding overall feature importance from model level overall_importance = [] @@ -349,15 +352,18 @@ def explain_example(self, ) if graph: - self._plot_graph(idx, pred_class, score, - num_features, contrib_df, how) + self._plot_graph(idx, + pred_class, + score, + num_features, + contrib_df, + how) else: contrib_df = self._build_contrib_df_sample(contrib_list, how=how) return contrib_df - def old_init(self, clf, test_mat, id_col=None, date_col='as_of_date', outcome_col='outcome', name_patterns=None): @@ -548,7 +554,6 @@ def _build_contrib_df_sample(self, mean_by_trees_list, how): return contrib_df - def _build_contrib_df(self, mean_by_trees_list, test_mat, idx, sample, feature_stats, how='features'): """ Build contribution dataframe. From 436c42925c7d03a9a30dee07e446f52a39f2ff61 Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:09:33 -0500 Subject: [PATCH 74/80] sample explanation patterns --- lorax/lorax.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lorax/lorax.py b/lorax/lorax.py index 440e983..6049ece 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -549,6 +549,12 @@ def _build_contrib_df_sample(self, mean_by_trees_list, how): columns=['feature', 'contribution']) contrib_df.set_index('feature', inplace=True) + # if we're using name patterns, aggregate columns to pattern level, + # otherwise, join on column-level statistics (not available for pattern-level) + if how == 'patterns': + contrib_df = contrib_df.join(self.column_patterns, how='inner') + contrib_df = contrib_df.groupby(['name_pattern'])['contribution'].sum().to_frame() + # sort the resulting dataframe in descending order by contribution contrib_df.sort_values('contribution', ascending=False, inplace=True) From dc807963648fc13199c13260e7e93bf10cdf27ae Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:14:06 -0500 Subject: [PATCH 75/80] PEP8 --- lorax/lorax.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lorax/lorax.py b/lorax/lorax.py index 6049ece..8e969b8 100644 --- a/lorax/lorax.py +++ b/lorax/lorax.py @@ -479,7 +479,7 @@ def set_name_patterns(self, name_patterns): regex mapping associated with the object). Arguments: - name_patters (list) A list of regex patterns or compiled regex objects to + name_patterns (list) A list of regex patterns or compiled regex objects to group together features for reporting contributions. If using, each feature name in the test matrix must match one and only one pattern. """ @@ -808,7 +808,6 @@ def explain_example_old(self, idx, pred_class=None, num_features=10, graph=True, else: return contrib_df - def speak_for_the_trees(self, id, pred_class=None, num_features=20, graph=True, how='features'): """Explain an example's score. From 92b95cc206f0fe070ac8551b46fd9facc1d5f455 Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:18:19 -0500 Subject: [PATCH 76/80] pep8 --- tests/test_lorax.py | 79 +++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index 2c62eeb..187c59f 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -54,11 +54,13 @@ class TestLorax(unittest.TestCase): def test_calculated_feature_importances(self): """Test calculated feature importances.""" # Setting up lorax - lrx = TheLorax(clf=global_clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(clf=global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) feature1_contrib = lrx_out.contribution.loc['feature1'] @@ -85,11 +87,13 @@ def test_aggregated_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + _ = lrx.explain_example(idx=1, pred_class=1, graph=False) # Max depth is 1. Number of split_occurences must be equal to @@ -107,11 +111,12 @@ def test_logistic_regression_importances(self): clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) @@ -145,11 +150,12 @@ def test_size_global_dict(self): clf = clf.fit(X, y) # Setting up lorax - lrx = TheLorax(clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') _ = lrx.explain_example(idx=1, pred_class=1, graph=False) @@ -181,11 +187,12 @@ def test_add_overall_feature_importance(self): self.assertTupleEqual(true_result[i], result[i]) # Setting up lorax - lrx = TheLorax(global_clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') lrx_out = lrx.explain_example( idx=1, @@ -204,11 +211,12 @@ def test_multiple_rows_per_entity_id(self): """Test support of multiple rows per entity_id.""" # Setting up lorax # Getting output on test matrix with one row per entity_id - lrx = TheLorax(global_clf, column_names=data.columns.values, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') lrx_out = lrx.explain_example(idx=1, pred_class=1, graph=False) @@ -219,11 +227,12 @@ def test_multiple_rows_per_entity_id(self): # Checking that the output for original row of entity 1 # remains the same when using combined index - lrx = TheLorax(global_clf, column_names=data.columns.values, - test_mat=new_data, - id_col=['entity_id', 'as_of_date'], - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(global_clf, + column_names=data.columns.values, + test_mat=new_data, + id_col=['entity_id', 'as_of_date'], + date_col='as_of_date', + outcome_col='outcome') out_multi_rows = lrx.explain_example(idx=(1, '2017-08-21 18:01:57.040781'), pred_class=1, From 74654616c3d86a85418cb027d1d773bba6e94095 Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:22:01 -0500 Subject: [PATCH 77/80] pep8 --- tests/test_lorax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index 187c59f..643eb5e 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -7,7 +7,6 @@ import random import unittest -from unittest import TestSuite import numpy as np import pandas as pd from datetime import datetime From c2125c03a3e279e541ccccefefa15a2cd3546269 Mon Sep 17 00:00:00 2001 From: Kasun Amarasinghe Date: Thu, 27 Feb 2020 16:40:03 -0500 Subject: [PATCH 78/80] pep8 --- tests/test_new_lorax.py | 181 ++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 102 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 19d00ff..745d55c 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -4,16 +4,13 @@ sys.path.append(project_path) import pandas as pd -# from pandas.testing import assert_frame_equal import numpy as np import random from datetime import datetime from sklearn import datasets -from sklearn.datasets import load_breast_cancer from sklearn.ensemble import RandomForestClassifier from lorax.lorax import TheLorax -from lorax.utils import add_overall_feature_importance import unittest @@ -48,40 +45,40 @@ max_depth=max_depth, random_state=42).fit(X, y) + class TestLorax(unittest.TestCase): """Tests cases for Lorax.""" def test_feature_importances(self): """Test calculated feature importances.""" # Setting up lorax - lrx = TheLorax( - clf=global_clf, - column_names=features, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome') + lrx = TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') # without id_col (zero indexed) # lrx_out = lrx.explain_example_new(test_mat=data, idx=0, pred_class=1, graph=False) sample = data.loc[0, features].values - pred_class = 0 # The label w.r.t the explanations are generated - lrx_out = lrx.explain_example( - sample=sample, - test_mat=None, - descriptive=True, - idx=None, - pred_class=pred_class, - num_features=10, - graph=False + pred_class = 0 # The label w.r.t the explanations are generated + lrx_out = lrx.explain_example(sample=sample, + test_mat=None, + descriptive=True, + idx=None, + pred_class=pred_class, + num_features=10, + graph=False ) feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] print('Asserting feature importance scores...') + # Test cases for correct feature importances if pred_class == 1: self.assertEqual(feature1_contrib, 0.04889021376498209) @@ -95,14 +92,12 @@ def test_feature_importances(self): def test_feature_stats(self): """Testing the data loader""" - lrx = TheLorax( - clf=global_clf, - column_names=features, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome' - ) + lrx = TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') st1 = lrx.populate_feature_stats(data[features]) @@ -125,40 +120,31 @@ def test_old_vs_new_lorax(self): The old method was emoved from the class """ pass - lrx = TheLorax( - clf=global_clf, - column_names=features, - test_mat=data, - id_col='entity_id', - date_col='as_of_date', - outcome_col='outcome' - ) - - pred_class = 0 # The label w.r.t the explanations are generated + lrx = TheLorax(clf=global_clf, + column_names=features, + test_mat=data, + id_col='entity_id', + date_col='as_of_date', + outcome_col='outcome') + + pred_class = 0 # The label w.r.t the explanations are generated idx = 2 - lrx_out_new = lrx.explain_example( - sample=None, - test_mat=None, - descriptive=True, - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False - ) - - lrx_out_old = lrx.explain_example_old( - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False, - how='features' - ) + lrx_out_new = lrx.explain_example(sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False) + + lrx_out_old = lrx.explain_example_old(idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='features') pd.testing.assert_frame_equal(lrx_out_new, lrx_out_old) - print() - - def test_explanation_patterns(self): """ Testing whether the explanations interms of @@ -166,9 +152,11 @@ def test_explanation_patterns(self): """ # Creating the data with cateorical columns to have regex feature patterns - X2, y2 = datasets.make_classification(n_samples=10000, n_features=6, - n_informative=3, n_redundant=2, - random_state=42) + X2, y2 = datasets.make_classification(n_samples=10000, + n_features=6, + n_informative=3, + n_redundant=2, + random_state=42) data2 = np.append(X2, y2.reshape(y2.shape[0], 1), axis=1) columns2 = ["feature1", "feature2", @@ -177,11 +165,7 @@ def test_explanation_patterns(self): data2 = pd.DataFrame(data2, columns=columns2) # Creating the categorical features - data2['category']= pd.cut( - data2['category'], - bins=2, - labels=['a','b'] - ) + data2['category'] = pd.cut(data2['category'], bins=2, labels=['a', 'b']) data2 = pd.get_dummies(data2, columns=['category']) @@ -189,48 +173,41 @@ def test_explanation_patterns(self): n_estimators = 10 max_depth = 4 - clf = RandomForestClassifier( - n_estimators=n_estimators, - max_depth=max_depth, - random_state=42).fit(data2[features2].values, y2) - - lrx = TheLorax( - clf=clf, - column_names=features2, - column_patterns=['feature'], - test_mat=data2, - id_col=None, - date_col=None, - outcome_col='outcome' - ) + clf = RandomForestClassifier(n_estimators=n_estimators, + max_depth=max_depth, + random_state=42).fit(data2[features2].values, y2) + + lrx = TheLorax(clf=clf, + column_names=features2, + column_patterns=['feature'], + test_mat=data2, + id_col=None, + date_col=None, + outcome_col='outcome') idx = 0 sample = data2.loc[0, features2].values - pred_class = 0 # The label w.r.t the explanations are generated - - lrx_out = lrx.explain_example( - sample=None, - test_mat=None, - descriptive=True, - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False, - how='patterns' - ) - - lrx_out_old = lrx.explain_example_old( - idx=idx, - pred_class=pred_class, - num_features=10, - graph=False, - how='patterns' - ) - - # Assreting that both methods yield the same answer + pred_class = 0 # The label w.r.t the explanations are generated + + lrx_out = lrx.explain_example(sample=None, + test_mat=None, + descriptive=True, + idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns') + + lrx_out_old = lrx.explain_example_old(idx=idx, + pred_class=pred_class, + num_features=10, + graph=False, + how='patterns') + + # Asserting that both methods yield the same answer pd.testing.assert_frame_equal(lrx_out, lrx_out_old) if __name__ == '__main__': unittest.main() - + From 9a75118feb75f70a8a5412692b701bce64e8d9a8 Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 22:29:54 +0000 Subject: [PATCH 79/80] fixed error --- tests/test_new_lorax.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_new_lorax.py b/tests/test_new_lorax.py index 745d55c..49cfa1d 100644 --- a/tests/test_new_lorax.py +++ b/tests/test_new_lorax.py @@ -119,7 +119,6 @@ def test_old_vs_new_lorax(self): returned the same results as the old one. The old method was emoved from the class """ - pass lrx = TheLorax(clf=global_clf, column_names=features, test_mat=data, @@ -179,7 +178,7 @@ def test_explanation_patterns(self): lrx = TheLorax(clf=clf, column_names=features2, - column_patterns=['feature'], + column_patterns=['feature', 'category_'], test_mat=data2, id_col=None, date_col=None, From bac348139057abf9276f11ed2241db4a7817203c Mon Sep 17 00:00:00 2001 From: Kasun A Date: Thu, 27 Feb 2020 23:49:00 +0000 Subject: [PATCH 80/80] rounding importances to 5 points --- tests/test_lorax.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_lorax.py b/tests/test_lorax.py index 643eb5e..5e2ea53 100644 --- a/tests/test_lorax.py +++ b/tests/test_lorax.py @@ -122,9 +122,15 @@ def test_logistic_regression_importances(self): feature1_contrib = lrx_out.contribution.loc['feature1'] feature5_contrib = lrx_out.contribution.loc['feature5'] + feature1_contrib = round(feature1_contrib, 5) + feature5_contrib = round(feature5_contrib, 5) + true_feature1_contrib = round(2.186415806126551, 5) + true_feature5_contrib = round(-3.228614405467005, 5) + + # Test cases for correct feature importances - self.assertEqual(feature1_contrib, 2.186415806126551) - self.assertEqual(feature5_contrib, -3.228614405467005) + self.assertEqual(feature1_contrib, true_feature1_contrib) + self.assertEqual(feature5_contrib, true_feature5_contrib) # Test case if we can recover lr prediction # Can't use all of sample because it now contains intercept as last element