Skip to content

Commit 11e4a4a

Browse files
committed
refactor: replace pandas apply with more efficient means (#655)
1 parent 22dbece commit 11e4a4a

File tree

6 files changed

+26
-23
lines changed

6 files changed

+26
-23
lines changed

docs/sources/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ The CHANGELOG for the current development version is available at
2525

2626
- Implemented both `use_clones` and `fit_base_estimators` (previously `refit` in `EnsembleVoteClassifier`) for `EnsembleVoteClassifier` and `StackingClassifier`. ([#670](https://github.com/rasbt/mlxtend/pull/670) via [Katrina Ni](https://github.com/nilichen))
2727

28+
- Improve the runtime performance of the `apriori` function and its tests by replacing pandas' `.apply` with NumPy's `vectorize`, as benchmarks show `vectorize` is faster than `apply` for the current use case. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
29+
30+
- Improve the efficiency of the `generate_itemsets` function by replacing Python lists with NumPy arrays and replacing iterative division with array division. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
31+
2832
##### Bug Fixes
2933

3034
- Fix axis DeprecationWarning in matplotlib v3.1.0 and newer. ([#673](https://github.com/rasbt/mlxtend/pull/673))

mlxtend/frequent_patterns/apriori.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,8 @@ def _support(_x, _n_rows, _is_sparse):
324324
res_df.columns = ['support', 'itemsets']
325325
if use_colnames:
326326
mapping = {idx: item for idx, item in enumerate(df.columns)}
327-
res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
328-
mapping[i] for i in x]))
327+
res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
328+
mapping.get, res_df['itemsets'])]
329329
res_df = res_df.reset_index(drop=True)
330330

331331
if verbose:

mlxtend/frequent_patterns/fpcommon.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,19 @@ def setup_fptree(df, min_support):
5959

6060

6161
def generate_itemsets(generator, num_itemsets, colname_map):
62-
itemsets = []
63-
supports = []
62+
itemsets = np.array([])
63+
supports = np.array([])
6464
for sup, iset in generator:
65-
itemsets.append(frozenset(iset))
66-
supports.append(sup / num_itemsets)
65+
itemsets = np.append(itemsets, frozenset(iset))
66+
supports = np.append(supports, sup)
67+
68+
supports = np.divide(supports, num_itemsets)
6769

6870
res_df = pd.DataFrame({'support': supports, 'itemsets': itemsets})
6971

7072
if colname_map is not None:
71-
res_df['itemsets'] = res_df['itemsets'] \
72-
.apply(lambda x: frozenset([colname_map[i] for i in x]))
73-
73+
res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
74+
colname_map.get, res_df['itemsets'])]
7475
return res_df
7576

7677

mlxtend/frequent_patterns/tests/test_association_rules.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,8 @@
2626

2727
def test_default():
2828
res_df = association_rules(df_freq_items)
29-
res_df['antecedents'] = res_df['antecedents'].apply(
30-
lambda x: str(frozenset(x)))
31-
res_df['consequents'] = res_df['consequents'].apply(
32-
lambda x: str(frozenset(x)))
29+
res_df['antecedents'] = np.vectorize(str)(res_df['antecedents'])
30+
res_df['consequents'] = np.vectorize(str)(res_df['consequents'])
3331
res_df.sort_values(columns_ordered, inplace=True)
3432
res_df.reset_index(inplace=True, drop=True)
3533

@@ -46,10 +44,10 @@ def test_default():
4644
columns=columns_ordered
4745
)
4846

49-
expect['antecedents'] = expect['antecedents'].apply(
50-
lambda x: str(frozenset(x)))
51-
expect['consequents'] = expect['consequents'].apply(
52-
lambda x: str(frozenset(x)))
47+
expect['antecedents'] = np.vectorize(str)(
48+
np.vectorize(frozenset)(expect['antecedents']))
49+
expect['consequents'] = np.vectorize(str)(
50+
np.vectorize(frozenset)(expect['consequents']))
5351
expect.sort_values(columns_ordered, inplace=True)
5452
expect.reset_index(inplace=True, drop=True)
5553

@@ -68,8 +66,8 @@ def test_datatypes():
6866
# check if association_rule converts it internally
6967
# back to frozensets
7068
df_freq_items_copy = df_freq_items.copy()
71-
df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
72-
.apply(lambda x: set(x))
69+
df_freq_items_copy['itemsets'] = np.vectorize(set)(
70+
df_freq_items_copy['itemsets'])
7371

7472
res_df = association_rules(df_freq_items)
7573
for i in res_df['antecedents']:

mlxtend/frequent_patterns/tests/test_fpbase.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,11 +201,11 @@ def test_default(self):
201201

202202
def test_max_len(self):
203203
res_df1 = self.fpalgo(self.df)
204-
max_len = np.max(res_df1['itemsets'].apply(len))
204+
max_len = np.vectorize(len)(res_df1['itemsets']).max()
205205
assert max_len == 3
206206

207207
res_df2 = self.fpalgo(self.df, max_len=2)
208-
max_len = np.max(res_df2['itemsets'].apply(len))
208+
max_len = np.vectorize(len)(res_df2['itemsets']).max()
209209
assert max_len == 2
210210

211211
def test_low_memory_flag(self):

mlxtend/frequent_patterns/tests/test_fpmax.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ def test_default(self):
3232

3333
def test_max_len(self):
3434
res_df1 = fpmax(self.df)
35-
max_len = np.max(res_df1['itemsets'].apply(len))
35+
max_len = np.vectorize(len)(res_df1['itemsets']).max()
3636
assert max_len == 3
3737

3838
res_df2 = fpmax(self.df, max_len=2)
39-
max_len = np.max(res_df2['itemsets'].apply(len))
39+
max_len = np.vectorize(len)(res_df2['itemsets']).max()
4040
assert max_len == 2
4141

4242

0 commit comments

Comments
 (0)