From e34ff8c9f84c0f951b3169d93b37b89b83e42c84 Mon Sep 17 00:00:00 2001
From: Denis Barbier
Date: Sun, 15 Dec 2019 11:02:54 +0100
Subject: [PATCH 1/6] Apriori: implement join step of apriori-gen

The apriori-gen function described in section 2.1.1 of the Apriori paper
has two steps. First, the join step looks for pairs of itemsets sharing
the same prefix and creates a new candidate by appending both of their
last items to this prefix. Here is the pseudocode copied from the paper:

  select p.1, p.2, ..., p.k-1, q.k-1
  from p in L(k-1), q in L(k-1)
  where p.1 = q.1, ..., p.k-2 = q.k-2, p.k-1 < q.k-1

The reason is that if an itemset q with the same prefix as p does not
belong to L(k-1), then the candidate p + (q.k-1,) cannot be frequent.
Before this commit, we were considering p + (q.k-1,) for any q.k-1 > p.k-1.

The second step of the apriori-gen function, the prune step, will be
implemented in a separate commit.

See discussion in #644.
---
 mlxtend/frequent_patterns/apriori.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
index 8e41e75ae..8d9170084 100644
--- a/mlxtend/frequent_patterns/apriori.py
+++ b/mlxtend/frequent_patterns/apriori.py
@@ -30,8 +30,10 @@ def generate_new_combinations(old_combinations):
 
     Returns
     -----------
-    Generator of all combinations from the last step x items
-    from the previous step.
+    Generator of combinations based on the last state of Apriori algorithm.
+    In order to reduce number of candidates, this function implements the
+    join step of apriori-gen described in section 2.1.1 of Apriori paper.
+    Prune step is not yet implemented.
 
     Examples
     -----------
@@ -40,15 +42,17 @@
 
     """
 
-    items_types_in_previous_step = np.unique(old_combinations.flatten())
-    for old_combination in old_combinations:
-        max_combination = old_combination[-1]
-        mask = items_types_in_previous_step > max_combination
-        valid_items = items_types_in_previous_step[mask]
-        old_tuple = tuple(old_combination)
-        for item in valid_items:
-            yield from old_tuple
-            yield item
+    length = len(old_combinations)
+    for i, old_combination in enumerate(old_combinations):
+        head_i = list(old_combination[:-1])
+        j = i + 1
+        while j < length:
+            *head_j, tail_j = old_combinations[j]
+            if head_i != head_j:
+                break
+            yield from old_combination
+            yield tail_j
+            j = j + 1
 
 
 def generate_new_combinations_low_memory(old_combinations, X, min_support,

From f8131a73182ec3dad56890849a7d553331b97c6c Mon Sep 17 00:00:00 2001
From: Denis Barbier
Date: Wed, 18 Dec 2019 10:42:49 +0100
Subject: [PATCH 2/6] Apriori: implement prune step of apriori-gen

The apriori-gen function described in section 2.1.1 of the Apriori paper
has two steps; the first one was implemented in the previous commit.
The second step, the prune step, takes each candidate c produced by the
join step and checks that every (k-1)-tuple obtained by removing a single
element from c is in L(k-1). Since NumPy arrays are not hashable, we
cannot use set() for itemset lookup, so we define a very simple prefix
tree class.
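For reference, the two steps combine as in the minimal standalone sketch
below. This is not the patched mlxtend code: the function name apriori_gen
and the toy L3 exist only for this example, and the input is assumed to be
a lexicographically sorted list of sorted tuples.

    from itertools import combinations

    def apriori_gen(L_prev):
        # L_prev: frequent (k-1)-itemsets as sorted tuples, in lexicographic order
        prev_set = set(L_prev)
        candidates = []
        for i, p in enumerate(L_prev):
            for q in L_prev[i + 1:]:
                # join step: p and q share the same (k-2)-prefix and p[-1] < q[-1]
                if p[:-1] != q[:-1]:
                    break
                c = p + (q[-1],)
                # prune step: every (k-1)-subset of c must itself be frequent
                if all(s in prev_set for s in combinations(c, len(c) - 1)):
                    candidates.append(c)
        return candidates

    # Example from the paper: L3 = {123, 124, 134, 135, 234}
    L3 = [(1, 2, 3), (1, 2, 4), (1, 3, 4), (1, 3, 5), (2, 3, 4)]
    print(apriori_gen(L3))
    # -> [(1, 2, 3, 4)]; the join also builds (1, 3, 4, 5), which the prune
    #    step rejects because (1, 4, 5) is not in L3

Unlike the patched generator, this sketch also re-checks the two
(k-1)-subsets that are already known to be p and q.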
---
 mlxtend/frequent_patterns/apriori.py | 56 ++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
index 8d9170084..4dec35dc0 100644
--- a/mlxtend/frequent_patterns/apriori.py
+++ b/mlxtend/frequent_patterns/apriori.py
@@ -9,6 +9,44 @@
 from ..frequent_patterns import fpcommon as fpc
 
 
+class _FixedLengthTrie:
+
+    """Fixed-length trie (prefix tree).
+
+    Parameters
+    ----------
+    combinations: list of itemsets
+      All combinations with enough support in the last step
+
+    Attributes
+    ----------
+    root : dict
+      Root node
+    """
+    __slots__ = ("root")
+
+    def __init__(self, combinations):
+        self.root = dict()
+        for combination in combinations:
+            current = self.root
+            for item in combination:
+                try:
+                    current = current[item]
+                except KeyError:
+                    next_node = dict()
+                    current[item] = next_node
+                    current = next_node
+
+    def __contains__(self, combination):
+        current = self.root
+        try:
+            for item in combination:
+                current = current[item]
+            return True
+        except KeyError:
+            return False
+
+
 def generate_new_combinations(old_combinations):
     """
     Generator of all combinations based on the last state of Apriori algorithm
@@ -32,8 +70,7 @@
     -----------
     Generator of combinations based on the last state of Apriori algorithm.
     In order to reduce number of candidates, this function implements the
-    join step of apriori-gen described in section 2.1.1 of Apriori paper.
-    Prune step is not yet implemented.
+    apriori-gen function described in section 2.1.1 of Apriori paper.
 
     Examples
     -----------
@@ -43,6 +80,7 @@
     """
 
     length = len(old_combinations)
+    trie = _FixedLengthTrie(old_combinations)
     for i, old_combination in enumerate(old_combinations):
         head_i = list(old_combination[:-1])
         j = i + 1
@@ -50,8 +88,18 @@
             *head_j, tail_j = old_combinations[j]
             if head_i != head_j:
                 break
-            yield from old_combination
-            yield tail_j
+            # Prune old_combination+(item,) if any subset is not frequent
+            candidate = tuple(old_combination) + (tail_j,)
+            # No need to check the last two values, because test_candidate
+            # is then old_combinations[i] and old_combinations[j]
+            for idx in range(len(candidate) - 2):
+                test_candidate = list(candidate)
+                del test_candidate[idx]
+                if test_candidate not in trie:
+                    # early exit from for-loop skips else clause just below
+                    break
+            else:
+                yield from candidate
             j = j + 1

From 85ca67d3da0ff950b787ca6e703f44b182366c3a Mon Sep 17 00:00:00 2001
From: Denis Barbier
Date: Thu, 19 Dec 2019 10:38:22 +0100
Subject: [PATCH 3/6] Let apriori always use low_memory processing

Thanks to the previous optimizations, processing with low_memory=True is
now as efficient as with low_memory=False and makes it possible to process
much larger datasets. Removing the low_memory=False code path makes the
code simpler. The downside is that the number of itemsets to process is no
longer known in advance, so it is displayed after processing. We now
display the number of itemsets that remain after the prune step.

Note that commit 2f928cb introduced a bug: the reported number of
processed combinations was multiplied by the itemset length.

Since vectorized operations are no longer performed on frequent itemsets,
they are stored as lists of tuples.
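As an aside, the per-itemset support counting that replaces the vectorized
code can be pictured with the sketch below. It only illustrates the sparse
branch (counting rows through the CSC indptr/indices arrays); the helper
name and the toy matrix are invented for this example and do not appear in
the patch.

    import numpy as np
    from scipy.sparse import csc_matrix

    # toy one-hot matrix: 4 transactions x 4 items (illustrative data only)
    X = csc_matrix(np.array([[1, 1, 0, 1],
                             [1, 0, 1, 1],
                             [1, 1, 1, 0],
                             [0, 1, 1, 1]], dtype=bool))
    X.eliminate_zeros()   # make sure stored entries are all nonzero

    def itemset_support_sparse(X, itemset):
        count = np.zeros(X.shape[0], dtype=int)
        for item in itemset:
            # row indices of the stored entries of column `item`
            count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1
        # a transaction supports the itemset when every item is present
        return np.count_nonzero(count == len(itemset)) / X.shape[0]

    print(itemset_support_sparse(X, (0, 1)))   # 0.5: items 0 and 1 co-occur in 2 of 4 rows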
--- mlxtend/frequent_patterns/apriori.py | 208 +++++------------- .../frequent_patterns/tests/test_fpbase.py | 2 +- 2 files changed, 55 insertions(+), 155 deletions(-) diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index 4dec35dc0..f8c4ac6d7 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -52,18 +52,17 @@ def generate_new_combinations(old_combinations): Generator of all combinations based on the last state of Apriori algorithm Parameters ----------- - old_combinations: np.array + old_combinations: list of tuples All combinations with enough support in the last step - Combinations are represented by a matrix. - Number of columns is equal to the combination size + Combinations are represented by a list of tuples. + All tuples have the same length, which is equal to the combination size of the previous step. - Each row represents one combination + Each tuple represents one combination and contains item type ids in the ascending order ``` - 0 1 - 0 15 20 - 1 15 22 - 2 17 19 + 15 20 + 15 22 + 17 19 ``` Returns @@ -89,7 +88,7 @@ def generate_new_combinations(old_combinations): if head_i != head_j: break # Prune old_combination+(item,) if any subset is not frequent - candidate = tuple(old_combination) + (tail_j,) + candidate = old_combination + (tail_j,) # No need to check the last two values, because test_candidate # is then old_combinations[i] and old_combinations[j] for idx in range(len(candidate) - 2): @@ -99,90 +98,10 @@ def generate_new_combinations(old_combinations): # early exit from for-loop skips else clause just below break else: - yield from candidate + yield candidate j = j + 1 -def generate_new_combinations_low_memory(old_combinations, X, min_support, - is_sparse): - """ - Generator of all combinations based on the last state of Apriori algorithm - Parameters - ----------- - old_combinations: np.array - All combinations with enough support in the last step - Combinations are represented by a matrix. - Number of columns is equal to the combination size - of the previous step. - Each row represents one combination - and contains item type ids in the ascending order - ``` - 0 1 - 0 15 20 - 1 15 22 - 2 17 19 - ``` - - X: np.array or scipy sparse matrix - The allowed values are either 0/1 or True/False. - For example, - - ``` - 0 True False True True False True - 1 True False True False False True - 2 True False True False False False - 3 True True False False False False - 4 False False True True True True - 5 False False True False True True - 6 False False True False True False - 7 True True False False False False - ``` - - min_support : float (default: 0.5) - A float between 0 and 1 for minumum support of the itemsets returned. - The support is computed as the fraction - `transactions_where_item(s)_occur / total_transactions`. - - is_sparse : bool True if X is sparse - - Returns - ----------- - Generator of all combinations from the last step x items - from the previous step. Every combination contains the - number of transactions where this item occurs, followed - by item type ids in the ascending order. 
- No combination other than generated - do not have a chance to get enough support - - Examples - ----------- - For usage examples, please see - http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/ - - """ - - items_types_in_previous_step = np.unique(old_combinations.flatten()) - rows_count = X.shape[0] - threshold = min_support * rows_count - for old_combination in old_combinations: - max_combination = old_combination[-1] - mask = items_types_in_previous_step > max_combination - valid_items = items_types_in_previous_step[mask] - old_tuple = tuple(old_combination) - if is_sparse: - mask_rows = X[:, old_tuple].toarray().all(axis=1) - X_cols = X[:, valid_items].toarray() - supports = X_cols[mask_rows].sum(axis=0) - else: - mask_rows = X[:, old_tuple].all(axis=1) - supports = X[mask_rows][:, valid_items].sum(axis=0) - valid_indices = (supports >= threshold).nonzero()[0] - for index in valid_indices: - yield supports[index] - yield from old_tuple - yield valid_items[index] - - def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, low_memory=False): """Get frequent itemsets from a one-hot DataFrame @@ -220,16 +139,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, possible itemsets lengths (under the apriori condition) are evaluated. verbose : int (default: 0) - Shows the number of iterations if >= 1 and `low_memory` is `True`. If - >=1 and `low_memory` is `False`, shows the number of combinations. - - low_memory : bool (default: False) - If `True`, uses an iterator to search for combinations above - `min_support`. - Note that while `low_memory=True` should only be used for large dataset - if memory resources are limited, because this implementation is approx. - 3-6x slower than the default. - + Shows the number of combinations if >= 1. Returns ----------- @@ -292,6 +202,8 @@ def _support(_x, _n_rows, _is_sparse): X = df.values else: X = df.to_coo().tocsc() + # See comment below + X.eliminate_zeros() is_sparse = True elif hasattr(df, "sparse"): # DataFrame with SparseArray (pandas >= 0.24) @@ -299,73 +211,61 @@ def _support(_x, _n_rows, _is_sparse): X = df.values else: X = df.sparse.to_coo().tocsc() + # See comment below + X.eliminate_zeros() is_sparse = True else: # dense DataFrame X = df.values is_sparse = False support = _support(X, X.shape[0], is_sparse) - ary_col_idx = np.arange(X.shape[1]) support_dict = {1: support[support >= min_support]} - itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)} + itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]} max_itemset = 1 - rows_count = float(X.shape[0]) - - all_ones = np.ones((int(rows_count), 1)) while max_itemset and max_itemset < (max_len or float('inf')): next_max_itemset = max_itemset + 1 - # With exceptionally large datasets, the matrix operations can use a - # substantial amount of memory. For low memory applications or large - # datasets, set `low_memory=True` to use a slower but more memory- - # efficient implementation. 
- if low_memory: - combin = generate_new_combinations_low_memory( - itemset_dict[max_itemset], X, min_support, is_sparse) - # slightly faster than creating an array from a list of tuples - combin = np.fromiter(combin, dtype=int) - combin = combin.reshape(-1, next_max_itemset + 1) - - if combin.size == 0: - break - if verbose: - print( - '\rProcessing %d combinations | Sampling itemset size %d' % - (combin.size, next_max_itemset), end="") - - itemset_dict[next_max_itemset] = combin[:, 1:] - support_dict[next_max_itemset] = combin[:, 0].astype(float) \ - / rows_count - max_itemset = next_max_itemset + combin = generate_new_combinations(itemset_dict[max_itemset]) + # count supports + frequent_itemsets = [] + frequent_supports = [] + processed = 0 + if is_sparse: + count = np.empty(X.shape[0], dtype=int) + for itemset in combin: + processed += 1 + count[:] = 0 + for item in itemset: + # Count nonnull entries via direct access to X indices; + # this requires X to be stored in CSC format, and to call + # X.eliminate_zeros() to remove null entries from X. + count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1 + support = np.count_nonzero(count == len(itemset)) / X.shape[0] + if support >= min_support: + frequent_itemsets.append(itemset) + frequent_supports.append(support) else: - combin = generate_new_combinations(itemset_dict[max_itemset]) - combin = np.fromiter(combin, dtype=int) - combin = combin.reshape(-1, next_max_itemset) - - if combin.size == 0: - break - if verbose: - print( - '\rProcessing %d combinations | Sampling itemset size %d' % - (combin.size, next_max_itemset), end="") - - if is_sparse: - _bools = X[:, combin[:, 0]] == all_ones - for n in range(1, combin.shape[1]): - _bools = _bools & (X[:, combin[:, n]] == all_ones) - else: - _bools = np.all(X[:, combin], axis=2) - - support = _support(np.array(_bools), rows_count, is_sparse) - _mask = (support >= min_support).reshape(-1) - if any(_mask): - itemset_dict[next_max_itemset] = np.array(combin[_mask]) - support_dict[next_max_itemset] = np.array(support[_mask]) - max_itemset = next_max_itemset - else: - # Exit condition - break + _bools = np.empty(X.shape[0], dtype=bool) + for itemset in combin: + processed += 1 + _bools.fill(True) + for item in itemset: + np.logical_and(_bools, X[:, item], out=_bools) + support = np.count_nonzero(_bools) / X.shape[0] + if support >= min_support: + frequent_itemsets.append(itemset) + frequent_supports.append(support) + if not frequent_itemsets: + # Exit condition + break + if verbose: + print( + '\rProcessed %d combinations | Sampling itemset size %d' % + (processed, next_max_itemset), end="") + itemset_dict[next_max_itemset] = frequent_itemsets + support_dict[next_max_itemset] = frequent_supports + max_itemset = next_max_itemset all_res = [] for k in sorted(itemset_dict): diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py index 4a7f79e12..30551a03c 100644 --- a/mlxtend/frequent_patterns/tests/test_fpbase.py +++ b/mlxtend/frequent_patterns/tests/test_fpbase.py @@ -229,7 +229,7 @@ def test_low_memory_flag(self): _ = self.fpalgo(self.df, low_memory=True, verbose=1) # Only get the last value of the stream to reduce test noise - expect = 'Processing 4 combinations | Sampling itemset size 3\n' + expect = 'Processed 1 combinations | Sampling itemset size 3\n' out = out.getvalue().split('\r')[-1] assert out == expect else: From 29170655e07d11eb5b3c9276cb2c973f5359e843 Mon Sep 17 00:00:00 2001 From: Denis Barbier Date: Thu, 2 Jan 2020 13:14:03 +0100 
Subject: [PATCH 4/6] Replace _FixedLengthTrie by set This is now possible because tuples are hashable. --- mlxtend/frequent_patterns/apriori.py | 42 ++-------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index f8c4ac6d7..777ba6ba4 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -9,44 +9,6 @@ from ..frequent_patterns import fpcommon as fpc -class _FixedLengthTrie: - - """Fixed-length trie (prefix tree). - - Parameters - ---------- - combinations: list of itemsets - All combinations with enough support in the last step - - Attributes - ---------- - root : dict - Root node - """ - __slots__ = ("root") - - def __init__(self, combinations): - self.root = dict() - for combination in combinations: - current = self.root - for item in combination: - try: - current = current[item] - except KeyError: - next_node = dict() - current[item] = next_node - current = next_node - - def __contains__(self, combination): - current = self.root - try: - for item in combination: - current = current[item] - return True - except KeyError: - return False - - def generate_new_combinations(old_combinations): """ Generator of all combinations based on the last state of Apriori algorithm @@ -79,7 +41,7 @@ def generate_new_combinations(old_combinations): """ length = len(old_combinations) - trie = _FixedLengthTrie(old_combinations) + set_old_combinations = set(old_combinations) for i, old_combination in enumerate(old_combinations): head_i = list(old_combination[:-1]) j = i + 1 @@ -94,7 +56,7 @@ def generate_new_combinations(old_combinations): for idx in range(len(candidate) - 2): test_candidate = list(candidate) del test_candidate[idx] - if test_candidate not in trie: + if tuple(test_candidate) not in set_old_combinations: # early exit from for-loop skips else clause just below break else: From eb8066789990b540a2b5093333685dec45bdc7eb Mon Sep 17 00:00:00 2001 From: Denis Barbier Date: Fri, 3 Jan 2020 12:09:59 +0100 Subject: [PATCH 5/6] Replace _support function For unknbown reasons, np.sum is slow on a very large boolean array. --- mlxtend/frequent_patterns/apriori.py | 41 ++++++++-------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index 777ba6ba4..18ed98fcd 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -121,32 +121,6 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, """ - def _support(_x, _n_rows, _is_sparse): - """DRY private method to calculate support as the - row-wise sum of values / number of rows - - Parameters - ----------- - - _x : matrix of bools or binary - - _n_rows : numeric, number of rows in _x - - _is_sparse : bool True if _x is sparse - - Returns - ----------- - np.array, shape = (n_rows, ) - - Examples - ----------- - For usage examples, please see - http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/ - - """ - out = (np.sum(_x, axis=0) / _n_rows) - return np.array(out).reshape(-1) - if min_support <= 0.: raise ValueError('`min_support` must be a positive ' 'number within the interval `(0, 1]`. 
' @@ -180,7 +154,17 @@ def _support(_x, _n_rows, _is_sparse): # dense DataFrame X = df.values is_sparse = False - support = _support(X, X.shape[0], is_sparse) + if is_sparse: + # Count nonnull entries via direct access to X indices; + # this requires X to be stored in CSC format, and to call + # X.eliminate_zeros() to remove null entries from X. + support = np.array([X.indptr[idx+1] - X.indptr[idx] + for idx in range(X.shape[1])], dtype=int) + else: + # Faster than np.count_nonzero(X, axis=0) or np.sum(X, axis=0), why? + support = np.array([np.count_nonzero(X[:, idx]) + for idx in range(X.shape[1])], dtype=int) + support = support / X.shape[0] support_dict = {1: support[support >= min_support]} itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]} max_itemset = 1 @@ -199,9 +183,6 @@ def _support(_x, _n_rows, _is_sparse): processed += 1 count[:] = 0 for item in itemset: - # Count nonnull entries via direct access to X indices; - # this requires X to be stored in CSC format, and to call - # X.eliminate_zeros() to remove null entries from X. count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1 support = np.count_nonzero(count == len(itemset)) / X.shape[0] if support >= min_support: From 09e6e2f5d80d7443fd33ac3fb45942e2350efc8b Mon Sep 17 00:00:00 2001 From: Denis Barbier Date: Fri, 3 Jan 2020 17:09:11 +0100 Subject: [PATCH 6/6] Add benchmark script This is a work in progress. --- mlxtend/frequent_patterns/tests/benchmark.py | 105 +++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 mlxtend/frequent_patterns/tests/benchmark.py diff --git a/mlxtend/frequent_patterns/tests/benchmark.py b/mlxtend/frequent_patterns/tests/benchmark.py new file mode 100644 index 000000000..7a7435770 --- /dev/null +++ b/mlxtend/frequent_patterns/tests/benchmark.py @@ -0,0 +1,105 @@ +# Sebastian Raschka 2014-2019 +# myxtend Machine Learning Library Extensions +# Author: Sebastian Raschka +# +# License: BSD 3 clause + +from mlxtend.preprocessing import TransactionEncoder +from mlxtend.frequent_patterns import apriori +import pandas as pd +import numpy as np +import gzip +import os +import sys +from time import time +import signal +from contextlib import contextmanager + + +@contextmanager +def timeout(time): + # Register a function to raise a TimeoutError on the signal. + signal.signal(signal.SIGALRM, raise_timeout) + # Schedule the signal to be sent after ``time``. + signal.alarm(time) + + try: + yield + except TimeoutError: + pass + finally: + # Unregister the signal so it won't be triggered + # if the timeout is not reached. 
+ signal.signal(signal.SIGALRM, signal.SIG_IGN) + + +def raise_timeout(signum, frame): + raise TimeoutError + + +files = [ + # "chess.dat.gz", + # "connect.dat.gz", + "mushroom.dat.gz", + "pumsb.dat.gz", + "pumsb_star.dat.gz", + # "T10I4D100K.dat.gz", + # "T40I10D100K.dat.gz", + # "kosarak.dat.gz", # this file is too large in sparse format + # "kosarak-1k.dat.gz", + # "kosarak-10k.dat.gz", + # "kosarak-50k.dat.gz", + # "kosarak-100k.dat.gz", + # "kosarak-200k.dat.gz", +] + + +low_memory = True +commit = "b731fd2" +test_supports = [0.5, 0.3, 0.1, 0.05, 0.03, 0.01, 0.005, 0.003, 0.001] + +for sparse, col_major in [[False, True], [False, False], [True, True]]: + sys.stdout = open("Results/{}-sparse{}-col_major{}.out".format( + commit, sparse, col_major), "w") + for filename in files: + with gzip.open(os.path.join("data", filename)) if filename.endswith( + ".gz" + ) else open(os.path.join("data", filename)) as f: + data = f.readlines() + + dataset = [list(map(int, line.split())) for line in data] + items = np.unique([item for itemset in dataset for item in itemset]) + print("{} contains {} transactions and {} items".format( + filename, len(dataset), len(items))) + + te = TransactionEncoder() + te_ary = te.fit(dataset).transform(dataset, sparse=sparse) + columns = ["c"+str(i) for i in te.columns_] + if sparse: + try: + df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=columns) + except AttributeError: + # pandas < 0.25 + df = pd.SparseDataFrame(te_ary, columns=columns, + default_fill_value=False) + else: + df = pd.DataFrame(te_ary, columns=columns) + if col_major: + df = pd.DataFrame({col: df[col] for col in df.columns}) + np.info(df.values) + + kwds = {"use_colnames": False, "low_memory": low_memory} + for min_support in test_supports: + tick = time() + with timeout(120): + print(apriori(df, min_support=min_support, verbose=1, **kwds)) + print("\nmin_support={} temps: {}\n".format( + min_support, time() - tick)) + if time() - tick < 10: + times = [] + for _ in range(5): + tick = time() + apriori(df, min_support=min_support, verbose=0, **kwds) + times.append(time() - tick) + print("Times:", times) + sys.stdout.close()
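The benchmark relies on the FIMI datasets, which are not shipped with the
repository. For a quick smoke test of the same pipeline without downloading
them, something along these lines should work; the toy transactions and the
support threshold are arbitrary.

    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori

    # five toy transactions over three items (made-up data)
    dataset = [[1, 2, 3], [1, 2], [2, 3], [1, 3], [1, 2, 3]]
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=["c" + str(i) for i in te.columns_])
    # every itemset with support >= 0.4 in the five toy transactions
    print(apriori(df, min_support=0.4, use_colnames=True))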