Skip to content

Commit 11e4a4a

Browse files
committed
refactor: replace pandas apply with more efficient means (#655)
1 parent 22dbece commit 11e4a4a

File tree

6 files changed

+26
-23
lines changed

6 files changed

+26
-23
lines changed

docs/sources/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ The CHANGELOG for the current development version is available at
2525

2626
- Implemented both `use_clones` and `fit_base_estimators` (previously `refit` in `EnsembleVoteClassifier`) for `EnsembleVoteClassifier` and `StackingClassifier`. ([#670](https://github.com/rasbt/mlxtend/pull/670) via [Katrina Ni](https://github.com/nilichen))
2727

28+
- Improve the runtime performance of the `apriori` function and its tests by replacing pandas' `.apply` with NumPy's `vectorize`, as benchmarks show `vectorize` is faster than `apply` for the current use case. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
29+
30+
- Improve the efficiency of the `generate_itemsets` function by replacing Python lists with NumPy arrays and replacing iterative division with array division. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
31+
2832
##### Bug Fixes
2933

3034
- Fix axis DeprecationWarning in matplotlib v3.1.0 and newer. ([#673](https://github.com/rasbt/mlxtend/pull/673))

mlxtend/frequent_patterns/apriori.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,8 @@ def _support(_x, _n_rows, _is_sparse):
324324
res_df.columns = ['support', 'itemsets']
325325
if use_colnames:
326326
mapping = {idx: item for idx, item in enumerate(df.columns)}
327-
res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
328-
mapping[i] for i in x]))
327+
res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
328+
mapping.get, res_df['itemsets'])]
329329
res_df = res_df.reset_index(drop=True)
330330

331331
if verbose:

mlxtend/frequent_patterns/fpcommon.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,19 @@ def setup_fptree(df, min_support):
5959

6060

6161
def generate_itemsets(generator, num_itemsets, colname_map):
62-
itemsets = []
63-
supports = []
62+
itemsets = np.array([])
63+
supports = np.array([])
6464
for sup, iset in generator:
65-
itemsets.append(frozenset(iset))
66-
supports.append(sup / num_itemsets)
65+
itemsets = np.append(itemsets, frozenset(iset))
66+
supports = np.append(supports, sup)
67+
68+
supports = np.divide(supports, num_itemsets)
6769

6870
res_df = pd.DataFrame({'support': supports, 'itemsets': itemsets})
6971

7072
if colname_map is not None:
71-
res_df['itemsets'] = res_df['itemsets'] \
72-
.apply(lambda x: frozenset([colname_map[i] for i in x]))
73-
73+
res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
74+
colname_map.get, res_df['itemsets'])]
7475
return res_df
7576

7677

mlxtend/frequent_patterns/tests/test_association_rules.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,8 @@
2626

2727
def test_default():
2828
res_df = association_rules(df_freq_items)
29-
res_df['antecedents'] = res_df['antecedents'].apply(
30-
lambda x: str(frozenset(x)))
31-
res_df['consequents'] = res_df['consequents'].apply(
32-
lambda x: str(frozenset(x)))
29+
res_df['antecedents'] = np.vectorize(str)(res_df['antecedents'])
30+
res_df['consequents'] = np.vectorize(str)(res_df['consequents'])
3331
res_df.sort_values(columns_ordered, inplace=True)
3432
res_df.reset_index(inplace=True, drop=True)
3533

@@ -46,10 +44,10 @@ def test_default():
4644
columns=columns_ordered
4745
)
4846

49-
expect['antecedents'] = expect['antecedents'].apply(
50-
lambda x: str(frozenset(x)))
51-
expect['consequents'] = expect['consequents'].apply(
52-
lambda x: str(frozenset(x)))
47+
expect['antecedents'] = np.vectorize(str)(
48+
np.vectorize(frozenset)(expect['antecedents']))
49+
expect['consequents'] = np.vectorize(str)(
50+
np.vectorize(frozenset)(expect['consequents']))
5351
expect.sort_values(columns_ordered, inplace=True)
5452
expect.reset_index(inplace=True, drop=True)
5553

@@ -68,8 +66,8 @@ def test_datatypes():
6866
# check if association_rule converts it internally
6967
# back to frozensets
7068
df_freq_items_copy = df_freq_items.copy()
71-
df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
72-
.apply(lambda x: set(x))
69+
df_freq_items_copy['itemsets'] = np.vectorize(set)(
70+
df_freq_items_copy['itemsets'])
7371

7472
res_df = association_rules(df_freq_items)
7573
for i in res_df['antecedents']:

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,11 @@ def test_default(self):
201201

202202
def test_max_len(self):
203203
res_df1 = self.fpalgo(self.df)
204-
max_len = np.max(res_df1['itemsets'].apply(len))
204+
max_len = np.vectorize(len)(res_df1['itemsets']).max()
205205
assert max_len == 3
206206

207207
res_df2 = self.fpalgo(self.df, max_len=2)
208-
max_len = np.max(res_df2['itemsets'].apply(len))
208+
max_len = np.vectorize(len)(res_df2['itemsets']).max()
209209
assert max_len == 2
210210

211211
def test_low_memory_flag(self):

mlxtend/frequent_patterns/tests/test_fpmax.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ def test_default(self):
3232

3333
def test_max_len(self):
3434
res_df1 = fpmax(self.df)
35-
max_len = np.max(res_df1['itemsets'].apply(len))
35+
max_len = np.vectorize(len)(res_df1['itemsets']).max()
3636
assert max_len == 3
3737

3838
res_df2 = fpmax(self.df, max_len=2)
39-
max_len = np.max(res_df2['itemsets'].apply(len))
39+
max_len = np.vectorize(len)(res_df2['itemsets']).max()
4040
assert max_len == 2
4141

4242

0 commit comments

Comments
 (0)