Project-MONAI
diff --git a/‎MetricsReloaded/metrics/calibration_measures.py‎
Lines changed: 146 additions & 8 deletions b/‎MetricsReloaded/metrics/calibration_measures.py‎
Lines changed: 146 additions & 8 deletions
@@ -31,7 +31,7 @@
 import numpy as np
 import math
 from scipy.special import gamma
-
+import warnings
 # from metrics.pairwise_measures import CacheFunctionOutput
 from MetricsReloaded.utility.utils import (
     CacheFunctionOutput,
@@ -91,7 +91,7 @@ def class_wise_expectation_calibration_error(self):
 
             cwECE = \dfrac{1}{K}\sum_{k=1}^{K}\sum_{i=1}^{N}\dfrac{\vert B_{i,k} \vert}{N} \left(y_{k}(B_{i,k}) - p_{k}(B_{i,k})\right)
 
-
+        :return: cwece
         """
 
         if "bins_ece" in self.dict_args:
@@ -138,11 +138,22 @@ def expectation_calibration_error(self):
         """
         Derives the expectation calibration error in the case of binary task
         bins_ece is the key in the dictionary for the number of bins to consider
+        Cheat sheet SN 3.68 p113
+        Defined in Mahdi Pakdaman Naeini, Gregory Cooper, and Milos Hauskrecht. Obtaining well calibrated probabilities using
+        bayesian binning. In Twenty-Ninth AAAI Conference on Artificial Intelligence, 2015.
         Default is 10
+
+        .. math::
+
+            ECE = \sum_{m=1}^{M} \dfrac{|B_m|}{n}|\dfrac{1}{|B_m|}\sum_{i \in B_m}1(pred_{ik}==ref_{ik})-\dfrac{1}{|B_m|}\sum_{i \in B_m}pred_i|
+
+        :return: ece
+
         """
         if "bins_ece" in self.dict_args:
             nbins = self.dict_args["bins_ece"]
         else:
+            warnings.warn("Bins ECE not specified in optional arguments dictionary - default set to 10")
             nbins = 10
         step = 1.0 / nbins
         range_values = np.arange(0, 1.00001, step)
@@ -169,7 +180,55 @@ def expectation_calibration_error(self):
             else:
                 list_values.append(nsamples * np.abs(prop - np.mean(pred_sel)))
             numb_samples += nsamples
-        return np.sum(np.asarray(list_values)) / numb_samples
+        ece = np.sum(np.asarray(list_values)) / numb_samples
+        return ece
+    
+
+    def maximum_calibration_error(self):
+        """
+        Derives the maximum calibration error (MCE) in the case of a binary task.
+        bins_mce is the key in dict_args for the number of bins to consider
+        (default is 10, with a warning). An empty bin contributes a gap of 0.
+
+        .. math::
+
+            MCE = \max_{m}|\dfrac{1}{|B_m|}\sum_{i \in B_m}ref_i-\dfrac{1}{|B_m|}\sum_{i \in B_m}pred_i|
+
+        :return: mce, the largest per-bin absolute gap between the mean reference and the mean predicted probability
+
+        """
+        if "bins_mce" in self.dict_args:
+            nbins = self.dict_args["bins_mce"]
+        else:
+            warnings.warn("Bins MCE not specified in optional arguments dictionary - default set to 10")
+            nbins = 10
+        step = 1.0 / nbins
+        range_values = np.arange(0, 1.00001, step)  # bin edges; stop slightly above 1 so the edge at 1.0 is kept
+        list_values = []
+        numb_samples = 0  # NOTE(review): never read in this method
+        pred_prob = self.pred[:,1]  # column 1 of pred; presumably the positive-class probability - TODO confirm
+        for (l, u) in zip(range_values[:-1], range_values[1:]):  # bins are (l, u]; a probability of exactly 0 falls in no bin
+            ref_tmp = np.where(
+                np.logical_and(pred_prob > l, pred_prob <= u),
+                self.ref,
+                np.ones_like(self.ref) * -1,  # -1 marks samples outside the current bin
+            )
+            ref_sel = ref_tmp[ref_tmp > -1]
+            nsamples = np.size(ref_sel)
+            prop = np.sum(ref_sel) / nsamples  # NOTE(review): 0/0 for an empty bin; the value is discarded by the nsamples == 0 branch below
+            pred_tmp = np.where(
+                np.logical_and(pred_prob > l, pred_prob <= u),
+                pred_prob,
+                np.ones_like(pred_prob) * -1,
+            )
+            pred_sel = pred_tmp[pred_tmp > -1]
+            if nsamples == 0:
+                list_values.append(0)  # empty bin: no gap recorded
+            else:
+                list_values.append(np.abs(prop - np.mean(pred_sel)))
+        mce = np.max(np.asarray(list_values))
+        return mce
+
 
     def brier_score(self):
         """
@@ -179,22 +238,44 @@ def brier_score(self):
         Glenn W Brier et al. 1950. Verification of forecasts expressed in terms of probability. Monthly weather review 78, 1
         (1950), 1–3.
 
+        .. math::
+
+            BS = \dfrac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C}(p_{ic}-r_{ic})^2
+
+        where :math:`p_{ic}` is the probability for class c and :math:`r_{ic}` the binary reference for class c and element i
+
         :return: brier score (BS)
+
         """
         bs = np.mean(np.sum(np.square(self.one_hot_ref - self.pred),1))
         return bs
 
     def root_brier_score(self):
         """
+        Determines the root Brier score (RBS), the square root of the value returned by brier_score
+
         Gruber S. and Buettner F., Better Uncertainty Calibration via Proper Scores
         for Classification and Beyond, In Proceedings of the 36th International
         Conference on  Neural Information Processing Systems, 2022
+
+        .. math::
+
+            RBS = \sqrt{BS}
+
+        :return: rbs, the square root of the Brier score
         """
-        return np.sqrt(self.brier_score())
+        rbs = np.sqrt(self.brier_score())
+        return rbs
 
     def logarithmic_score(self):
         """
         Calculation of the logarithmic score https://en.wikipedia.org/wiki/Scoring_rule
+        
+        .. math::
+
+            LS = 1/N\sum_{i=1}^{N}\log(pred_{ik})ref_{ik}
+
+        :return: ls
         """
         eps = 1e-10
         log_pred = np.log(self.pred + eps)
@@ -204,27 +285,48 @@ def logarithmic_score(self):
         return ls
 
     def distance_ij(self,i,j):
+        """
+        Determines the Euclidean distance between rows i and j of self.pred (the prediction vectors of two samples)
+
+        :return: distance, the L2 norm of pred[i,:] - pred[j,:]
+        """
         pred_i = self.pred[i,:]
         pred_j = self.pred[j,:]
         distance = np.sqrt(np.sum(np.square(pred_i - pred_j)))
         return distance
 
 
     def kernel_calculation(self, i,j):
+        """
+        Defines the kernel value for two samples i and j with the following definition for k(x_i,x_j)
+
+        .. math::
+
+            k(x_i,x_j) = exp(-||x_i-x_j||/ \\nu)I_{N}
+
+        where :math:`\\nu` is the bandwidth (the bandwidth_kce option, or the median heuristic if not specified) and N the number of classes
+
+        :return: kernel_value, an N x N matrix
+
+        """
         distance = self.distance_ij(i,j)
         if 'bandwidth_kce' in self.dict_args.keys():
             bandwidth = self.dict_args['bandwidth_kce']
         else:
             bandwidth = median_heuristic(self.pred)
         value = np.exp(-distance/bandwidth)
-        identity = np.ones([self.pred.shape[1], self.pred.shape[1]])
-        return value * identity
+        identity = np.eye(self.pred.shape[1])  # I_N in the formula: identity of size number of classes
+        kernel_value = value*identity
+        return kernel_value
 
     def kernel_calibration_error(self):
         """
         Based on the paper Widmann, D., Lindsten, F., and Zachariah, D.
         Calibration tests in multi-class classification: A unifying framework.
         Advances in Neural Information Processing Systems, 32:12257–12267, 2019.
+
+        :return: kce
+
         """
         one_hot_ref = one_hot_encode(self.ref, self.pred.shape[1])
         numb_samples = self.pred.shape[0]
@@ -246,6 +348,9 @@ def top_label_classification_error(self):
         """
         Calculation of the top-label classification error. Assumes pred_proba a matrix K x Numb observations
         with probability to be in class k for observation i in position (k,i)
+
+        :return: tce
+
         """
         class_max = np.argmax(self.pred, 1)
         prob_pred_max = np.max(self.pred, 1)
@@ -271,7 +376,12 @@ def kernel_based_ece(self):
         Teodora Popordanoska, Raphael Sayer, and Matthew B Blaschko. 2022. A Consistent and Differentiable Lp Canonical
         Calibration Error Estimator. In Advances in Neural Information Processing Systems.
 
+        .. math::
+ 
+            ECE\_KDE = 1/N \sum_{j=1}^{N}||\dfrac{\sum_{i \\neq j}k_{Dir}(pred_j,pred_i)ref_i}{\sum_{i \\neq j}k_{Dir}(pred_j,pred_i)} - pred_j || 
+
         :return: ece_kde
+
         """
         ece_kde = 0
         one_hot_ref = one_hot_encode(self.ref, self.pred.shape[1])
@@ -298,6 +408,18 @@ def kernel_based_ece(self):
         return ece_kde
 
     def gamma_ik(self, i, k):
+        """
+        Definition of gamma value for sample i class k of the predictions
+
+        .. math::
+
+            gamma_{ik} = \Gamma(pred_{ik}/h + 1)
+
+        where h is the bandwidth value set as default to 0.5
+
+        :return: gamma_ik
+
+        """
         pred_ik = self.pred[i, k]
         if "bandwidth" in self.dict_args.keys():
             h = self.dict_args["bandwidth"]
@@ -308,6 +430,16 @@ def gamma_ik(self, i, k):
         return gamma_ik
 
     def dirichlet_kernel(self, j, i):
+        """
+        Calculation of Dirichlet kernel value for predictions of samples i and j
+
+        .. math::
+
+            k_{Dir}(x_j,x_i) = \dfrac{\Gamma(\sum_{k=1}^{K}\\alpha_{ik})}{\prod_{k=1}^{K}\Gamma(\\alpha_{ik})}\prod_{k=1}^{K}x_{jk}^{\\alpha_{ik}-1}
+        
+        :return: kernel_value
+
+        """
         pred_i = self.pred[i, :]
         pred_j = self.pred[j, :]
         nclasses = self.pred.shape[1]
@@ -331,16 +463,22 @@ def negative_log_likelihood(self):
 
         George Cybenko, Dianne P O’Leary, and Jorma Rissanen. 1998. The Mathematics of Information Coding, Extraction
         and Distribution. Vol. 107. Springer Science & Business Media.
+        Cheat Sheet p 116 - Figure SN 3.71
 
         .. math::
 
-            -\sum_{i=1}{N} log(p_{i,k} | y_i=k)
+            NLL = -\dfrac{1}{N}\sum_{i=1}^{N}\sum_{k=1}^{C} y_{ik} \cdot \log(p_{ik})
+
+        where :math:`y_{ik}`, the outcome, is 1 if the class of :math:`y_{i}` is k and :math:`p_{ik}` is the predicted
+        probability for sample :math:`x_i` and class k
+
+        :return: NLL
 
         """
         log_pred = np.log(self.pred)
         numb_samples = self.pred.shape[0]
         ll = np.sum(log_pred[range(numb_samples), self.ref])
-        nll = -1 * ll
+        nll = -1/numb_samples * ll
         return nll
 
     def to_dict_meas(self, fmt="{:.4f}"):