Commit af69d42

Re-implement GMMHMM based upon:
Watanabe, Shinji, and Jen-Tzung Chien. Bayesian Speech and Language Processing. Cambridge University Press, 2015. This approach appears simpler than the current implementation and will make the VariationalGMMHMM easier to implement later on.
1 parent e01a10e commit af69d42

3 files changed, +109 -86 lines changed
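
Read as equations, the weight and mean updates implemented by the diff below are (a sketch in ad-hoc notation rather than the book's; gamma_{tjk} is the posterior responsibility of state j, mixture k at time t, i.e. post_comp_mix in the code, alpha is weights_prior, lambda is means_weight and m is means_prior):

    \hat{w}_{jk} = \frac{\sum_t \gamma_{tjk} + (\alpha_{jk} - 1)}
                        {\sum_{k'} \bigl( \sum_t \gamma_{tjk'} + (\alpha_{jk'} - 1) \bigr)},
    \qquad
    \hat{\mu}_{jk} = \frac{\sum_t \gamma_{tjk}\, x_t + \lambda_{jk}\, m_{jk}}
                          {\sum_t \gamma_{tjk} + \lambda_{jk}}

With the MLE defaults (alpha = 1, lambda = 0) these reduce to the usual EM updates; the covariance update is handled analogously in _do_mstep.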

src/hmmlearn/_emissions.py

Lines changed: 19 additions & 30 deletions
@@ -134,10 +134,11 @@ def _initialize_sufficient_statistics(self):
         stats = super()._initialize_sufficient_statistics()
         stats['post'] = np.zeros(self.n_components)
         stats['obs'] = np.zeros((self.n_components, self.n_features))
-        stats['obs**2'] = np.zeros((self.n_components, self.n_features))
         if self.covariance_type in ('tied', 'full'):
             stats['obs*obs.T'] = np.zeros((self.n_components, self.n_features,
                                            self.n_features))
+        elif self.covariance_type in ('diag', 'spherical'):
+            stats['obs**2'] = np.zeros((self.n_components, self.n_features))
         return stats
 
     def _accumulate_sufficient_statistics(
@@ -181,7 +182,7 @@ def _generate_sample_from_state(self, state, random_state):
         )
 
 
-class BaseGMMHMM(BaseHMM):
+class BaseGMMHMM(_AbstractHMM):
 
     def _get_n_fit_scalars_per_param(self):
         nc = self.n_components
@@ -222,11 +223,9 @@ def _compute_log_likelihood(self, X):
     def _initialize_sufficient_statistics(self):
         stats = super()._initialize_sufficient_statistics()
         stats['post_mix_sum'] = np.zeros((self.n_components, self.n_mix))
-        stats['post_sum'] = np.zeros(self.n_components)
-
         if 'm' in self.params:
-            lambdas, mus = self.means_weight, self.means_prior
-            stats['m_n'] = lambdas[:, :, None] * mus
+            stats['m_n'] = np.zeros(
+                (self.n_components, self.n_mix, self.n_features))
         if 'c' in self.params:
             stats['c_n'] = np.zeros_like(self.covars_)
 
@@ -254,7 +253,7 @@ def _accumulate_sufficient_statistics(self, stats, X, lattice,
 
         post_mix = np.zeros((n_samples, self.n_components, self.n_mix))
         for p in range(self.n_components):
-            log_denses = self._compute_log_weighted_gaussian_densities(X, p)
+            log_denses = self._log_density_for_sufficient_statistics(X, p)
             log_normalize(log_denses, axis=-1)
             with np.errstate(under="ignore"):
                 post_mix[:, p, :] = np.exp(log_denses)
@@ -263,33 +262,23 @@ def _accumulate_sufficient_statistics(self, stats, X, lattice,
         post_comp_mix = post_comp[:, :, None] * post_mix
 
         stats['post_mix_sum'] += post_comp_mix.sum(axis=0)
-        stats['post_sum'] += post_comp.sum(axis=0)
-
         if 'm' in self.params:  # means stats
             stats['m_n'] += np.einsum('ijk,il->jkl', post_comp_mix, X)
 
         if 'c' in self.params:  # covariance stats
-            centered = X[:, None, None, :] - self.means_
-
-            def outer_f(x):  # Outer product over features.
-                return x[..., :, None] * x[..., None, :]
-
-            if self.covariance_type == 'full':
-                centered_dots = outer_f(centered)
-                c_n = np.einsum('ijk,ijklm->jklm', post_comp_mix,
-                                centered_dots)
-            elif self.covariance_type == 'diag':
-                centered2 = np.square(centered, out=centered)  # reuse
-                c_n = np.einsum('ijk,ijkl->jkl', post_comp_mix, centered2)
-            elif self.covariance_type == 'spherical':
-                # Faster than (x**2).sum(-1).
-                centered_norm2 = np.einsum('...i,...i', centered, centered)
-                c_n = np.einsum('ijk,ijk->jk', post_comp_mix, centered_norm2)
-            elif self.covariance_type == 'tied':
-                centered_dots = outer_f(centered)
-                c_n = np.einsum('ijk,ijklm->jlm', post_comp_mix, centered_dots)
-
-            stats['c_n'] += c_n
+            if self.covariance_type == "full":
+                stats['c_n'] += np.einsum(
+                    'ijk,il,im->jklm', post_comp_mix, X, X)
+            elif self.covariance_type == "tied":
+                stats['c_n'] += np.einsum(
+                    'ijk,il,im->jlm', post_comp_mix, X, X)
+            elif self.covariance_type == "diag":
+                stats['c_n'] += np.einsum(
+                    'ijk,il->jkl', post_comp_mix, X**2)
+            elif self.covariance_type == "spherical":
+                stats['c_n'] += np.einsum(
+                    'ijk,il->jk', post_comp_mix, X**2)
+
 
     def _generate_sample_from_state(self, state, random_state):
         cur_weights = self.weights_[state]
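
For reference, a minimal NumPy sketch (not part of the commit; all names and data below are synthetic) showing that the uncentered statistic accumulated above, together with the responsibilities and the first-order sums, determines the old centered statistic, which is why the centering can be deferred to the M-step:

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_components, n_mix, n_features = 50, 3, 2, 4
post_comp_mix = rng.random((n_samples, n_components, n_mix))
X = rng.normal(size=(n_samples, n_features))
means = rng.normal(size=(n_components, n_mix, n_features))

# New accumulation ('full'): sum_t gamma[t, c, k] * x_t x_t^T.
c_n_new = np.einsum('ijk,il,im->jklm', post_comp_mix, X, X)

# Old accumulation: sum_t gamma[t, c, k] * (x_t - mu_ck)(x_t - mu_ck)^T.
centered = X[:, None, None, :] - means
c_n_old = np.einsum('ijk,ijkl,ijkm->jklm', post_comp_mix, centered, centered)

# Re-centering the new statistic around the same means recovers the old one.
post = post_comp_mix.sum(axis=0)                   # sum_t gamma
obs = np.einsum('ijk,il->jkl', post_comp_mix, X)   # sum_t gamma * x_t
recentered = (c_n_new
              - np.einsum('jkl,jkm->jklm', obs, means)
              - np.einsum('jkl,jkm->jkml', obs, means)
              + np.einsum('jk,jkl,jkm->jklm', post, means, means))
assert np.allclose(recentered, c_n_old)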

src/hmmlearn/hmm.py

Lines changed: 88 additions & 54 deletions
@@ -387,10 +387,23 @@ def _do_mstep(self, stats):
                         (cvweight + stats['post'][:, None, None]))
 
 
-class GMMHMM(_emissions.BaseGMMHMM):
+class GMMHMM(_emissions.BaseGMMHMM, BaseHMM):
     """
     Hidden Markov Model with Gaussian mixture emissions.
 
+    Note:
+        The implementation supports both Maximum Likelihood Estimation (MLE)
+        and Maximum a-posteriori (MAP) approximation. By default, the various
+        priors are configured such that the MLE is learned. To configure the
+        model to make MAP estimation, set the various priors to 0.
+
+        This implementation is based upon:
+        Watanabe, Shinji, and Jen-Tzung Chien. Bayesian Speech and Language
+        Processing. Cambridge University Press, 2015.
+
+    TODO:
+        Sources for MAP priors for spherical and tied covariance
+
     Attributes
     ----------
     monitor_ : ConvergenceMonitor
@@ -417,6 +430,7 @@ class GMMHMM(_emissions.BaseGMMHMM):
         * (n_components, n_mix, n_features) if "diag",
         * (n_components, n_mix, n_features, n_features) if "full"
         * (n_components, n_features, n_features) if "tied".
+
     """
 
     def __init__(self, n_components=1, n_mix=1,
@@ -592,25 +606,29 @@ def compute_cv():
 
     def _init_covar_priors(self):
         if self.covariance_type == "full":
+            # Page 157 of Bayesian Speech and Language Processing
             if self.covars_prior is None:
                 self.covars_prior = 0.0
             if self.covars_weight is None:
-                self.covars_weight = -(1.0 + self.n_features + 1.0)
+                self.covars_weight = (1.0 + self.n_features)
         elif self.covariance_type == "tied":
+            # TODO - Source for these
             if self.covars_prior is None:
                 self.covars_prior = 0.0
             if self.covars_weight is None:
                 self.covars_weight = -(self.n_mix + self.n_features + 1.0)
         elif self.covariance_type == "diag":
+            # Page 158 of Bayesian Speech and Language Processing
             if self.covars_prior is None:
-                self.covars_prior = -1.5
+                self.covars_prior = 0
             if self.covars_weight is None:
-                self.covars_weight = 0.0
+                self.covars_weight = 2
         elif self.covariance_type == "spherical":
+            # TODO - Source for these
             if self.covars_prior is None:
-                self.covars_prior = -(self.n_mix + 2.0) / 2.0
+                self.covars_prior = 0
             if self.covars_weight is None:
-                self.covars_weight = 0.0
+                self.covars_weight = -(self.n_mix + 2.0) / 2.0
 
     def _fix_priors_shape(self):
         nc = self.n_components
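
A hypothetical usage sketch: the priors chosen by _init_covar_priors are only filled in when the constructor arguments are left as None, so they can be overridden per covariance type. The numeric values below are illustrative, not recommendations.

from hmmlearn.hmm import GMMHMM

model = GMMHMM(n_components=3, n_mix=2, covariance_type="diag",
               covars_prior=1e-2, covars_weight=3.0, random_state=0)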
@@ -731,6 +749,9 @@ def _check(self):
                     _log.warning("Covariance of state #%d, mixture #%d "
                                  "has a null eigenvalue.", i, j)
 
+    def _log_density_for_sufficient_statistics(self, X, component):
+        return self._compute_log_weighted_gaussian_densities(X, component)
+
     def _do_mstep(self, stats):
         super()._do_mstep(stats)
         nf = self.n_features
@@ -740,12 +761,16 @@ def _do_mstep(self, stats):
         if 'w' in self.params:
             alphas_minus_one = self.weights_prior - 1
             w_n = stats['post_mix_sum'] + alphas_minus_one
-            w_d = (stats['post_sum'] + alphas_minus_one.sum(axis=1))[:, None]
+            w_d = w_n.sum(axis=-1)[:, None]
             self.weights_ = w_n / w_d
 
         # Maximizing means
         if 'm' in self.params:
-            m_n = stats['m_n']
+            m_n = stats['m_n'] + np.einsum(
+                "cm,cmi->cmi",
+                self.means_weight,
+                self.means_prior
+            )
             m_d = stats['post_mix_sum'] + self.means_weight
             # If a component has zero weight, then replace nan (0/0?) means
             # by 0 (0/1). The actual value is irrelevant as the component will
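
A small check of the new denominator (synthetic numbers, nothing here is library code; it assumes the default weights_prior of 1, so alphas_minus_one is zero): because w_d is simply the row sum of w_n, each state's mixture weights come out exactly normalized.

import numpy as np

rng = np.random.default_rng(1)
post_mix_sum = rng.random((3, 2)) * 10     # stands in for stats['post_mix_sum']
alphas_minus_one = np.zeros((3, 2))        # weights_prior - 1 with the default prior
w_n = post_mix_sum + alphas_minus_one
w_d = w_n.sum(axis=-1)[:, None]
weights = w_n / w_d
assert np.allclose(weights.sum(axis=-1), 1.0)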
@@ -757,57 +782,66 @@ def _do_mstep(self, stats):
 
         # Maximizing covariances
         if 'c' in self.params:
-            lambdas, mus = self.means_weight, self.means_prior
-            centered_means = self.means_ - mus
-
-            def outer_f(x):  # Outer product over features.
-                return x[..., :, None] * x[..., None, :]
-
             if self.covariance_type == 'full':
-                centered_means_dots = outer_f(centered_means)
-
-                psis_t = np.transpose(self.covars_prior, axes=(0, 1, 3, 2))
-                nus = self.covars_weight
-
-                c_n = psis_t + lambdas[:, :, None, None] * centered_means_dots
-                c_n += stats['c_n']
-                c_d = (
-                    stats['post_mix_sum'] + 1 + nus + nf + 1
-                )[:, :, None, None]
-
+                # Pages 156-157 of Bayesian Speech and Language Processing
+                c_n = (self.covars_prior
+                       + stats['c_n']
+                       + np.einsum("ck,cki,ckj->ckij",
+                                   self.means_weight,
+                                   self.means_prior,
+                                   self.means_prior)
+                       - np.einsum("ck,cki,ckj->ckij",
+                                   stats['post_mix_sum'] + self.means_weight,
+                                   self.means_,
+                                   self.means_))
+                # Note that when self.covars_weight = 0
+                # and c_d <= 0, then we will have a failure. This is discussed
+                # on page 156 of the above book.
+                c_d = stats['post_mix_sum'] + self.covars_weight
+                c_d -= self.n_features - 1
+                c_d = c_d[:, :, None, None]
+            elif self.covariance_type == 'tied':
+                # inferred from 'full'
+                c_n = (self.covars_prior
+                       + stats['c_n']
+                       + np.einsum("ck,cki,ckj->cij",
+                                   self.means_weight,
+                                   self.means_prior,
+                                   self.means_prior)
+                       - np.einsum("ck,cki,ckj->cij",
+                                   stats['post_mix_sum'] + self.means_weight,
+                                   self.means_,
+                                   self.means_))
+                c_d = stats['post_mix_sum'].sum(axis=-1) + self.covars_weight
+                c_d += (nm + nf + 1.0)
+                c_d = c_d[:, None, None]
             elif self.covariance_type == 'diag':
-                alphas = self.covars_prior
-                betas = self.covars_weight
-                centered_means2 = centered_means ** 2
-
-                c_n = lambdas[:, :, None] * centered_means2 + 2 * betas
-                c_n += stats['c_n']
-                c_d = stats['post_mix_sum'][:, :, None] + 1 + 2 * (alphas + 1)
-
+                # Pages 157-158 of Bayesian Speech and Language Processing
+                c_n = (self.covars_prior
+                       + stats['c_n']
+                       + np.einsum("ck,cki->cki",
+                                   self.means_weight,
+                                   self.means_prior**2)
+                       - np.einsum("ck,cki->cki",
+                                   stats['post_mix_sum'] + self.means_weight,
+                                   self.means_**2))
+                c_d = (stats['post_mix_sum'][:, :, None]
+                       + self.covars_weight
+                       - 2)
             elif self.covariance_type == 'spherical':
-                centered_means_norm2 = np.einsum(  # Faster than (x**2).sum(-1)
-                    '...i,...i', centered_means, centered_means)
-
-                alphas = self.covars_prior
-                betas = self.covars_weight
-
-                c_n = lambdas * centered_means_norm2 + 2 * betas
-                c_n += stats['c_n']
-                c_d = nf * (stats['post_mix_sum'] + 1) + 2 * (alphas + 1)
-
-            elif self.covariance_type == 'tied':
-                centered_means_dots = outer_f(centered_means)
-
-                psis_t = np.transpose(self.covars_prior, axes=(0, 2, 1))
-                nus = self.covars_weight
-
-                c_n = np.einsum('ij,ijkl->ikl',
-                                lambdas, centered_means_dots) + psis_t
-                c_n += stats['c_n']
-                c_d = (stats['post_sum'] + nm + nus + nf + 1)[:, None, None]
+                # inferred from 'diag'
+                c_n = (self.covars_prior
+                       + stats['c_n']
+                       + np.einsum("ck,cki->ck",
+                                   self.means_weight,
+                                   self.means_prior**2)
+                       - np.einsum("ck,cki->ck",
+                                   stats['post_mix_sum'] + self.means_weight,
+                                   self.means_**2)) / nf
+                c_d = stats['post_mix_sum'] + self.covars_weight + (nm + 2)/2
 
             self.covars_ = c_n / c_d
-
+            assert not np.isnan(self.covars_).any(), self.covars_
 
 class MultinomialHMM(_emissions.BaseMultinomialHMM):
     """

src/hmmlearn/tests/test_gmm_hmm_new.py

Lines changed: 2 additions & 2 deletions
@@ -260,6 +260,6 @@ def test_chunked(sellf, covtype, init_params='mcw'):
         model2.fit(data, lengths=[200] * 5)
 
         assert_allclose(model1.means_, model2.means_, rtol=0, atol=1e-2)
-        assert_allclose(model1.covars_, model2.covars_, rtol=0, atol=1e-3)
-        assert_allclose(model1.weights_, model2.weights_, rtol=0, atol=1e-3)
+        assert_allclose(model1.covars_, model2.covars_, rtol=0, atol=1e-2)
+        assert_allclose(model1.weights_, model2.weights_, rtol=0, atol=1e-2)
         assert_allclose(model1.transmat_, model2.transmat_, rtol=0, atol=1e-2)
