diff --git a/README.rst b/README.rst
index 3cea35bb..ccc014c5 100644
--- a/README.rst
+++ b/README.rst
@@ -328,10 +328,10 @@ A more data-based approach can be taken when setting the contamination level. By

 .. code-block:: python

     from pyod.models.knn import KNN
-    from pyod.models.thresholds import FILTER
+    from pyod.models.thresholds import ZSCORE

     # Set the outlier detection and thresholding methods
-    clf = KNN(contamination=FILTER())
+    clf = KNN(contamination=ZSCORE())

 See supported thresholding methods in `thresholding `_.
diff --git a/docs/about.rst b/docs/about.rst
index f8e1b663..3eb1194d 100644
--- a/docs/about.rst
+++ b/docs/about.rst
@@ -61,7 +61,7 @@ Adam Goodge (PhD Researcher @ National University of Singapore):
 - Joined in 2022 (implemented LUNAR)
 - `LinkedIn (Adam Goodge) `_

-Daniel Kulik (Machine Learning Developer; MSc Student @ University of the Free State):
+Daniel Kulik (Machine Learning Developer; MSc Astrophysics @ University of the Free State):

 - Joined 2022 (implemented integration with PyThresh and more)
 - `LinkedIn (Daniel Kulik) `_
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 92481b87..78565163 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,5 @@
 combo
 furo
-geomstats
 joblib
 matplotlib
 nose
@@ -8,7 +7,7 @@ numpy>=1.19
 numba>=0.51
 pyclustering
 pytest
-pythresh>=0.3.1
+pythresh>=1.0.0
 ruptures
 scipy>=1.5.1
 scikit-learn>=0.22.0
diff --git a/docs/thresholding.rst b/docs/thresholding.rst
index b333e05b..37f0161a 100644
--- a/docs/thresholding.rst
+++ b/docs/thresholding.rst
@@ -5,32 +5,34 @@
 ================================== ================ ================================================================ ====================================================================================================================
 Type Abbr Algorithm Documentation
 ================================== ================ ================================================================ ====================================================================================================================
-Kernel-Based AUCP Area Under Curve Percentage `AUCP `_
-Statistical Moment-Based BOOT Bootstrapping `BOOT `_
-Normality-Based CHAU Chauvenet's Criterion `CHAU `_
-Linear Model CLF Trained Linear Classifier `CLF `_
-cluster-Based CLUST Clustering Based `CLUST `_
-Kernel-Based CPD Change Point Detection `CPD `_
-Transformation-Based DECOMP Decomposition `DECOMP `_
-Normality-Based DSN Distance Shift from Normal `DSN `_
-Curve-Based EB Elliptical Boundary `EB `_
-Kernel-Based FGD Fixed Gradient Descent `FGD `_
-Filter-Based FILTER Filtering Based `FILTER `_
-Curve-Based FWFM Full Width at Full Minimum `FWFM `_
-Statistical Test-Based GESD Generalized Extreme Studentized Deviate `GESD `_
-Filter-Based HIST Histogram Based `HIST `_
-Quantile-Based IQR Inter-Quartile Region `IQR `_
-Statistical Moment-Based KARCH Karcher mean (Riemannian Center of Mass) `KARCH `_
-Statistical Moment-Based MAD Median Absolute Deviation `MAD `_
-Statistical Test-Based MCST Monte Carlo Shapiro Tests `MCST `_
-Ensembles-Based META Meta-model Trained Classifier `META `_
-Transformation-Based MOLL Friedrichs' Mollifier `MOLL `_
-Statistical Test-Based MTT Modified Thompson Tau Test `MTT `_
-Linear Model OCSVM One-Class Support Vector Machine `OCSVM `_
-Quantile-Based QMCD Quasi-Monte Carlo Discrepancy `QMCD `_
-Linear Model REGR Regression Based `REGR `_
-Neural Networks VAE Variational Autoencoder `VAE `_
-Curve-Based WIND Topological Winding Number `WIND `_
-Transformation-Based YJ Yeo-Johnson Transformation `YJ `_
-Normality-Based ZSCORE Z-score `ZSCORE `_
+Kernel-Based AUCP Area Under Curve Percentage `AUCP `_
+Statistical Moment-Based BOOT Bootstrapping `BOOT `_
+Normality-Based CHAU Chauvenet's Criterion `CHAU `_
+Linear Model CLF Trained Linear Classifier `CLF `_
+Cluster-Based CLUST Clustering Based `CLUST `_
+Kernel-Based CPD Change Point Detection `CPD `_
+Transformation-Based DECOMP Decomposition `DECOMP `_
+Normality-Based DSN Distance Shift from Normal `DSN `_
+Curve-Based EB Elliptical Boundary `EB `_
+Kernel-Based FGD Fixed Gradient Descent `FGD `_
+Filter-Based FILTER Filtering Based `FILTER `_
+Curve-Based FWFM Full Width at Full Minimum `FWFM `_
+Statistical Test-Based GAMGMM Bayesian Contamination Estimation `GAMGMM `_
+Statistical Test-Based GESD Generalized Extreme Studentized Deviate `GESD `_
+Filter-Based HIST Histogram Based `HIST `_
+Quantile-Based IQR Inter-Quartile Region `IQR `_
+Statistical Moment-Based KARCH Karcher mean (Riemannian Center of Mass) `KARCH `_
+Statistical Moment-Based MAD Median Absolute Deviation `MAD `_
+Statistical Test-Based MCST Monte Carlo Shapiro Tests `MCST `_
+Ensembles-Based META Meta-model Trained Classifier `META `_
+Statistical Test-Based MIXMOD Normal & Non-Normal Mixture Models `MIXMOD `_
+Transformation-Based MOLL Friedrichs' Mollifier `MOLL `_
+Statistical Test-Based MTT Modified Thompson Tau Test `MTT `_
+Linear Model OCSVM One-Class Support Vector Machine `OCSVM `_
+Quantile-Based QMCD Quasi-Monte Carlo Discrepancy `QMCD `_
+Linear Model REGR Regression Based `REGR `_
+Neural Networks VAE Variational Autoencoder `VAE `_
+Curve-Based WIND Topological Winding Number `WIND `_
+Transformation-Based YJ Yeo-Johnson Transformation `YJ `_
+Normality-Based ZSCORE Z-score `ZSCORE `_
 ================================== ================ ================================================================ ====================================================================================================================
diff --git a/pyod/models/base.py b/pyod/models/base.py
index a5a4880c..789bd793 100644
--- a/pyod/models/base.py
+++ b/pyod/models/base.py
@@ -166,7 +166,7 @@ def predict(self, X, return_confidence=False):

         # if this is a PyThresh object
         else:
-            prediction = self.contamination.eval(pred_score)
+            prediction = self.contamination.predict(pred_score)

         if return_confidence:
             confidence = self.predict_confidence(X)
@@ -290,7 +290,7 @@ def predict_confidence(self, X):
             prediction = (test_scores > self.threshold_).astype('int').ravel()
         # if this is a PyThresh object
         else:
-            prediction = self.contamination.eval(test_scores)
+            prediction = self.contamination.predict(test_scores)

         np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])
         return confidence
@@ -574,7 +574,8 @@ def _process_decision_scores(self):

         # if this is a PyThresh object
         else:
-            self.labels_ = self.contamination.eval(self.decision_scores_)
+            self.contamination.fit(self.decision_scores_)
+            self.labels_ = self.contamination.labels_
             self.threshold_ = self.contamination.thresh_
             if not self.threshold_:
                 self.threshold_ = np.sum(self.labels_) / len(self.labels_)
diff --git a/pyod/models/thresholds.py b/pyod/models/thresholds.py
index 73012009..2e4ca7bc 100755
--- a/pyod/models/thresholds.py
+++ b/pyod/models/thresholds.py
@@ -5,6 +5,13 @@ def AUCP(**kwargs):
     to threshold scores generated by the decision_scores where outliers
     are set to any value beyond where the auc of the kde is less than
     the (mean + abs(mean-median))
     percent of the total kde auc.
+
+    Parameters
+    ----------
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.aucp import AUCP as AUCP_thres
     return AUCP_thres(**kwargs)
@@ -47,6 +54,10 @@ def CHAU(**kwargs):
       - 'mean': Construct a scaler with the mean of the scores
       - 'median: Construct a scaler with the median of the scores
       - 'gmean': Construct a scaler with the geometric mean of the scores
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.chau import CHAU as CHAU_thres
@@ -68,6 +79,10 @@ def CLF(**kwargs):
       - 'simple': Uses only the scores
       - 'complex': Uses the scores, log of the scores, and the scores' PDF
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.clf import CLF as CLF_thres
@@ -134,6 +149,10 @@ def CPD(**kwargs):
       - 'cdf': Use the cumulative distribution function
       - 'kde': Use the kernel density estimation
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

@@ -215,6 +234,13 @@ def EB(**kwargs):
     to threshold scores generated by the decision_scores where outliers
     are set to any value beyond a pseudo-random elliptical boundary
     set between inliers and outliers.
+
+    Parameters
+    ----------
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.eb import EB as EB_thres
@@ -229,6 +255,13 @@ def FGD(**kwargs):
     are set to any value beyond where the first derivative of the kde
     with respect to the decision scores passes the mean of the first
     and second inflection points.
+
+    Parameters
+    ----------
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.fgd import FGD as FGD_thres
@@ -269,6 +302,10 @@ def FILTER(**kwargs):
       - 'decimate': downsampling factor
       - 'detrend': number of break points
       - 'resample': resampling window size
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.filter import FILTER as FILTER_thres
@@ -282,12 +319,71 @@ def FWFM(**kwargs):
     a non-parametric means to threshold scores generated by the
     decision_scores where outliers are set to any value beyond the
     base width.
+
+    Parameters
+    ----------
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.fwfm import FWFM as FWFM_thres
     return FWFM_thres(**kwargs)


+def GAMGMM(**kwargs):
+    """GAMGMM class for gammaGMM thresholder.
+
+    Use a Bayesian method for estimating the posterior distribution
+    of the contamination factor (i.e., the proportion of anomalies)
+    for a given unlabeled dataset. The threshold is set such
+    that the proportion of predicted anomalies equals the
+    contamination factor.
+
+    Parameters
+    ----------
+
+    n_contaminations : int, optional (default=1000)
+        number of samples to draw from the contamination posterior distribution
+
+    n_draws : int, optional (default=50)
+        number of samples simultaneously drawn from each DPGMM component
+
+    p0 : float, optional (default=0.01)
+        probability that no anomalies are in the data
+
+    phigh : float, optional (default=0.01)
+        probability that there are more than high_gamma anomalies
+
+    high_gamma : float, optional (default=0.15)
+        sensibly high proportion of anomalies that has a low probability of occurring
+
+    gamma_lim : float, optional (default=0.5)
+        Upper gamma/proportion of anomalies limit
+
+    K : int, optional (default=100)
+        number of components for DPGMM used to approximate the Dirichlet Process
+
+    skip : bool, optional (default=False)
+        skip optimal hyperparameter test (this may return a sub-optimal solution)
+
+    steps : int, optional (default=100)
+        number of iterations to test for optimal hyperparameters
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
+
+    verbose : bool, optional (default=False)
+        print progress of the DPGMM fitting process every 20 iterations
+
+    """
+
+    from pythresh.thresholds.gamgmm import GAMGMM as GAMGMM_thres
+    return GAMGMM_thres(**kwargs)
+
+
 def GESD(**kwargs):
     """GESD class for Generalized Extreme Studentized Deviate thresholder.
@@ -299,11 +395,16 @@ def GESD(**kwargs):
     ----------

     max_outliers : int, optional (default='auto')
-        mamiximum number of outliers that the dataset may have. Default sets
+        maximum number of outliers that the dataset may have. Default sets
         max_outliers to be half the size of the dataset

     alpha : float, optional (default=0.05)
         significance level
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
+
     """

     from pythresh.thresholds.gesd import GESD as GESD_thres
@@ -322,8 +423,8 @@ def HIST(**kwargs):
     ----------

     nbins : int, optional (default='auto')
-        Number of bins to use in the hostogram, default set to int(len(scores)**0.7)
-
+        Number of bins to use in the histogram, default set to int(len(scores)**0.7)
+
     method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'}, optional (default='triangle')
         Histogram filtering based method

       - 'otsu': OTSU's method for filtering
       - 'yen': Yen's method for filtering
       - 'isodata': Ridler-Calvard or inter-means method for filtering
       - 'li': Li's iterative Minimum Cross Entropy method for filtering
       - 'minimum': Minimum between two maxima via smoothing method for filtering
       - 'triangle': Triangle algorithm method for filtering
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
     """

     from pythresh.thresholds.hist import HIST as HIST_thres
@@ -346,6 +451,13 @@ def IQR(**kwargs):
     means to threshold scores generated by the decision_scores where
     outliers are set to any value beyond the third quartile plus 1.5
     times the inter-quartile region.
+
+    Parameters
+    ----------
+
+    random_state : int, optional (default=1234)
+        Random seed for the random number generators of the thresholders. Can also
+        be set to None.
""" from pythresh.thresholds.iqr import IQR as IQR_thres @@ -371,6 +483,10 @@ def KARCH(**kwargs): - 'simple': Compute the Karcher mean using the 1D array of scores - 'complex': Compute the Karcher mean between a 2D array dot product of the scores and the sorted scores arrays + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.karch import KARCH as KARCH_thres @@ -384,6 +500,17 @@ def MAD(**kwargs): means to threshold scores generated by the decision_scores where outliers are set to any value beyond the mean plus the median absolute deviation over the standard deviation. + + Parameters + ---------- + + factor : int, optional (default=1) + The factor to multiply the MAD by to set the threshold. + The default is 1. + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.mad import MAD as MAD_thres @@ -430,11 +557,48 @@ def META(**kwargs): - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination - 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + """ from pythresh.thresholds.meta import META as META_thres return META_thres(**kwargs) +def MIXMOD(**kwargs): + """MIXMOD class for the Normal & Non-Normal Mixture Models thresholder. + + Use normal & non-normal mixture models to find a non-parametric means + to threshold scores generated by the decision_scores, where outliers + are set to any value beyond the posterior probability threshold + for equal posteriors of a two distribution mixture model. + + Parameters + ---------- + + method : str, optional (default='mean') + Method to evaluate selecting the best fit mixture model. Default + 'mean' sets this as the closest mixture models to the mean of the posterior + probability threshold for equal posteriors of a two distribution mixture model + for all fits. Setting 'ks' uses the two-sample Kolmogorov-Smirnov test for + goodness of fit. + + tol : float, optional (default=1e-5) + Tolerance for convergence of the EM fit + + max_iter : int, optional (default=250) + Max number of iterations to run EM during fit + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + + """ + + from pythresh.thresholds.mixmod import MIXMOD as MIXMOD_thres + return MIXMOD_thres(**kwargs) + def MOLL(**kwargs): """MOLL class for Friedrichs' mollifier thresholder. @@ -443,6 +607,12 @@ def MOLL(**kwargs): to threshold scores generated by the decision_scores where outliers are set to any value beyond one minus the maximum of the smoothed dataset via convolution. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the uniform distribution. Can also be set to None. 
""" from pythresh.thresholds.moll import MOLL as MOLL_thres @@ -459,8 +629,12 @@ def MTT(**kwargs): Parameters ---------- - strictness : [1,2,3,4,5], optional (default=4) - Level of strictness corresponding to the t-Student distribution map to sample + alpha : float, optional (default=0.01) + Confidence level corresponding to the t-Student distribution map to sample + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.mtt import MTT as MTT_thres return MTT_thres(**kwargs) @@ -539,9 +713,14 @@ def QMCD(**kwargs): lim : {'Q', 'P'}, optional (default='P') Filtering method to threshold scores using 1 - discrepancy - - - 'Q': Use quntile limiting + + - 'Q': Use quantile limiting - 'P': Use percentile limiting + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + """ from pythresh.thresholds.qmcd import QMCD as QMCD_thres @@ -647,6 +826,13 @@ def YJ(**kwargs): a non-parametric means to threshold scores generated by the decision_scores where outliers are set to any value beyond the max value in the YJ transformed data. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.yj import YJ as YJ_thres @@ -659,6 +845,17 @@ def ZSCORE(**kwargs): Use the zscore to evaluate a non-parametric means to threshold scores generated by the decision_scores where outliers are set to any value beyond a zscore of one. + + Parameters + ---------- + + factor : int, optional (default=1) + The factor to multiply the zscore by to set the threshold. + The default is 1. + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.zscore import ZSCORE as ZSCORE_thres diff --git a/pyod/test/test_thresholds.py b/pyod/test/test_thresholds.py index 2c190d95..e8d6006d 100644 --- a/pyod/test/test_thresholds.py +++ b/pyod/test/test_thresholds.py @@ -30,9 +30,9 @@ class TestThresholds(unittest.TestCase): def setUp(self): from pyod.models.thresholds import (AUCP, BOOT, CHAU, CLF, CLUST, CPD, DECOMP, DSN, EB, FGD, FILTER, - FWFM, GESD, HIST, IQR, KARCH, MAD, - MCST, META, MOLL, MTT, OCSVM, QMCD, - REGR, VAE, WIND, YJ, ZSCORE) + FWFM, GAMGMM, GESD, HIST, IQR, KARCH, + MAD, MCST, META, MIXMOD, MOLL, MTT, + OCSVM, QMCD, REGR, VAE, WIND, YJ, ZSCORE) self.n_train = 200 self.n_test = 100 @@ -45,11 +45,11 @@ def setUp(self): random_state=42, ) - self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(), - CPD(), DECOMP(), DSN(), EB(), FGD(), FILTER(), - FWFM(), GESD(), HIST(), IQR(), KARCH(), MAD(), - MCST(), META(), MOLL(), MTT(), OCSVM(), QMCD(), - REGR(), VAE(), WIND(), YJ(), ZSCORE()] + self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(), CPD(), + DECOMP(), DSN(), EB(), FGD(), FILTER(), FWFM(), + GAMGMM(skip=True), GESD(), HIST(), IQR(), KARCH(), + MAD(), MCST(), META(), MIXMOD(), MOLL(), MTT(), + OCSVM(), QMCD(), REGR(), VAE(), WIND(), YJ(), ZSCORE()] for contam in self.contam: self.clf = KDE(contamination=contam)