Skip to content

Mixture Models

sklego.mixture.bayesian_gmm_classifier.BayesianGMMClassifier

Bases: BaseEstimator, ClassifierMixin

The BayesianGMMClassifier trains a Gaussian Mixture Model for each class in y on a dataset X. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely.

Note

All the parameters are an exact copy of those of sklearn.mixture.BayesianGaussianMixture.

Attributes:

Name Type Description
gmms_ dict[int, BayesianGaussianMixture]

A dictionary of Bayesian Gaussian Mixture Models, one for each class.

classes_ np.ndarray of shape (n_classes,)

The classes seen during fit.

Source code in sklego/mixture/bayesian_gmm_classifier.py
class BayesianGMMClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that fits one Bayesian Gaussian Mixture Model per class label.

    During `fit` a separate density is estimated on the rows of `X` that belong to each class found in `y`.
    At prediction time every density scores the samples and the class with the highest score is selected.

    !!! note
        All the parameters are an exact copy of those of
        [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html).


    Attributes
    ----------
    gmms_ : dict[int, BayesianGaussianMixture]
        A dictionary of Bayesian Gaussian Mixture Models, one for each class.
    classes_ : np.ndarray of shape (n_classes,)
        The classes seen during `fit`.
    """

    def __init__(
        self,
        n_components=1,
        covariance_type="full",
        tol=0.001,
        reg_covar=1e-06,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=None,
        mean_precision_prior=None,
        mean_prior=None,
        degrees_of_freedom_prior=None,
        covariance_prior=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        # Hyper-parameters are stored untouched; they are only consumed inside `fit`.
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weight_concentration_prior_type = weight_concentration_prior_type
        self.weight_concentration_prior = weight_concentration_prior
        self.mean_precision_prior = mean_precision_prior
        self.mean_prior = mean_prior
        self.degrees_of_freedom_prior = degrees_of_freedom_prior
        self.covariance_prior = covariance_prior
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
        """Train one `BayesianGaussianMixture` per class found in `y`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training data.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        self : BayesianGMMClassifier
            The fitted estimator.
        """
        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        self.gmms_ = {}
        self.classes_ = unique_labels(y)

        # Every class receives its own, identically configured mixture.
        mixture_kwargs = {
            "n_components": self.n_components,
            "covariance_type": self.covariance_type,
            "tol": self.tol,
            "reg_covar": self.reg_covar,
            "max_iter": self.max_iter,
            "n_init": self.n_init,
            "init_params": self.init_params,
            "weight_concentration_prior_type": self.weight_concentration_prior_type,
            "weight_concentration_prior": self.weight_concentration_prior,
            "mean_precision_prior": self.mean_precision_prior,
            "mean_prior": self.mean_prior,
            "degrees_of_freedom_prior": self.degrees_of_freedom_prior,
            "covariance_prior": self.covariance_prior,
            "random_state": self.random_state,
            "warm_start": self.warm_start,
            "verbose": self.verbose,
            "verbose_interval": self.verbose_interval,
        }
        for label in self.classes_:
            members = y == label
            density = BayesianGaussianMixture(**mixture_kwargs)
            self.gmms_[label] = density.fit(X[members], y[members])
        return self

    def predict(self, X):
        """Return, for each row of `X`, the class whose mixture yields the highest probability.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples,)
            The predicted data.
        """
        check_is_fitted(self, ["gmms_", "classes_"])
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        best_column = self.predict_proba(X).argmax(axis=1)
        return self.classes_[best_column]

    def predict_proba(self, X):
        """Score `X` under every per-class mixture and normalise the scores row-wise with a softmax.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples, n_classes)
            The predicted probabilities.
        """
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmms_", "classes_"])
        n_samples, n_classes = X.shape[0], self.classes_.shape[0]
        class_scores = np.zeros((n_samples, n_classes))
        # Columns follow the order of `classes_`, which `predict` relies on.
        for column, label in enumerate(self.classes_):
            class_scores[:, column] = self.gmms_[label].score_samples(X)
        return softmax(class_scores, axis=1)

fit(X, y)

Fit the BayesianGMMClassifier model using X, y as training data.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features )

The training data.

required
y array-like of shape (n_samples,)

The target values.

required

Returns:

Name Type Description
self BayesianGMMClassifier

The fitted estimator.

Source code in sklego/mixture/bayesian_gmm_classifier.py
def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
    """Train one `BayesianGaussianMixture` per class found in `y`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training data.
    y : array-like of shape (n_samples,)
        The target values.

    Returns
    -------
    self : BayesianGMMClassifier
        The fitted estimator.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    self.gmms_ = {}
    self.classes_ = unique_labels(y)

    # Every class receives its own, identically configured mixture.
    mixture_kwargs = {
        "n_components": self.n_components,
        "covariance_type": self.covariance_type,
        "tol": self.tol,
        "reg_covar": self.reg_covar,
        "max_iter": self.max_iter,
        "n_init": self.n_init,
        "init_params": self.init_params,
        "weight_concentration_prior_type": self.weight_concentration_prior_type,
        "weight_concentration_prior": self.weight_concentration_prior,
        "mean_precision_prior": self.mean_precision_prior,
        "mean_prior": self.mean_prior,
        "degrees_of_freedom_prior": self.degrees_of_freedom_prior,
        "covariance_prior": self.covariance_prior,
        "random_state": self.random_state,
        "warm_start": self.warm_start,
        "verbose": self.verbose,
        "verbose_interval": self.verbose_interval,
    }
    for label in self.classes_:
        members = y == label
        density = BayesianGaussianMixture(**mixture_kwargs)
        self.gmms_[label] = density.fit(X[members], y[members])
    return self

predict(X)

Predict labels for X using fitted estimator.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples,)

The predicted data.

Source code in sklego/mixture/bayesian_gmm_classifier.py
def predict(self, X):
    """Return, for each row of `X`, the class whose mixture yields the highest probability.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted data.
    """
    check_is_fitted(self, ["gmms_", "classes_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    best_column = self.predict_proba(X).argmax(axis=1)
    return self.classes_[best_column]

predict_proba(X)

Predict probabilities for X using fitted estimator.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples, n_classes)

The predicted probabilities.

Source code in sklego/mixture/bayesian_gmm_classifier.py
def predict_proba(self, X):
    """Score `X` under every per-class mixture and normalise the scores row-wise with a softmax.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples, n_classes)
        The predicted probabilities.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmms_", "classes_"])
    n_samples, n_classes = X.shape[0], self.classes_.shape[0]
    class_scores = np.zeros((n_samples, n_classes))
    # Columns follow the order of `classes_`, which `predict` relies on.
    for column, label in enumerate(self.classes_):
        class_scores[:, column] = self.gmms_[label].score_samples(X)
    return softmax(class_scores, axis=1)

sklego.mixture.bayesian_gmm_detector.BayesianGMMOutlierDetector

Bases: OutlierMixin, BaseEstimator

The BayesianGMMOutlierDetector trains a Bayesian Gaussian Mixture model on a dataset X. Once a density is trained we can evaluate the likelihood scores to see if it is deemed likely.

By providing a threshold this model might then label outliers if their likelihood score is too low.

Note

The parameters other than threshold and method are an exact copy of the parameters in sklearn.mixture.BayesianGaussianMixture.

Parameters:

Name Type Description Default
threshold float

The limit at which the model thinks an outlier appears. With method="quantile" it must lie between 0 and 1; with method="stddev" it must be non-negative.

0.99
method Literal[quantile, stddev]

The method to use to apply the threshold.

Info

If you select method="quantile" then the threshold value represents the quantile value to start calling something an outlier.

If you select method="stddev" then the threshold value represents the number of standard deviations before calling something an outlier.

"quantile"

Attributes:

Name Type Description
gmm_ BayesianGaussianMixture

The trained Bayesian Gaussian Mixture Model.

likelihood_threshold_ float

The threshold value used to determine if something is an outlier.

Source code in sklego/mixture/bayesian_gmm_detector.py
class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
    """The `BayesianGMMOutlierDetector` trains a Bayesian Gaussian Mixture model on a dataset `X`. Once a density is
    trained we can evaluate the likelihood scores to see if it is deemed likely.

    By providing a `threshold` this model might then label outliers if their likelihood score is too low.

    !!! note
        The parameters other than `threshold` and `method` are an exact copy of the parameters in
        [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html).

    Parameters
    ----------
    threshold : float, default=0.99
        The limit at which the model thinks an outlier appears. With `method="quantile"` it must lie between 0 and 1;
        with `method="stddev"` it must be non-negative.
    method : Literal["quantile", "stddev"], default="quantile"
        The method to use to apply the `threshold`.

        !!! info
            If you select `method="quantile"` then the threshold value represents the quantile value to start calling
            something an outlier.

            If you select `method="stddev"` then the threshold value represents the
            number of standard deviations before calling something an outlier.

    Attributes
    ----------
    gmm_ : BayesianGaussianMixture
        The trained Bayesian Gaussian Mixture Model.
    likelihood_threshold_ : float
        The threshold value used to determine if something is an outlier.
    """

    _ALLOWED_METHODS = ("quantile", "stddev")

    def __init__(
        self,
        threshold=0.99,
        method="quantile",
        n_components=1,
        covariance_type="full",
        tol=0.001,
        reg_covar=1e-06,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=None,
        mean_precision_prior=None,
        mean_prior=None,
        degrees_of_freedom_prior=None,
        covariance_prior=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        # Outlier-specific settings; validated lazily in `fit`.
        self.threshold = threshold
        self.method = method

        # Settings forwarded verbatim to `BayesianGaussianMixture` in `fit`.
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weight_concentration_prior_type = weight_concentration_prior_type
        self.weight_concentration_prior = weight_concentration_prior
        self.mean_precision_prior = mean_precision_prior
        self.mean_prior = mean_prior
        self.degrees_of_freedom_prior = degrees_of_freedom_prior
        self.covariance_prior = covariance_prior
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
        """Fit the mixture on `X` and derive `likelihood_threshold_` from the scores of the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training data.
        y : array-like of shape (n_samples,)
            Ignored, present for compatibility.

        Returns
        -------
        self : BayesianGMMOutlierDetector
            The fitted estimator.

        Raises
        ------
        ValueError
            - If `method="quantile"` and `threshold` is smaller than 0 or larger than 1.
            - If `method="stddev"` and `threshold` is negative.
            - If `method` is not in `["quantile", "stddev"]`.
        """

        # GMM sometimes throws an error if you don't do this
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        # Threshold checks come first; the method-name check below stays independent of them.
        if self.method == "quantile":
            if (self.threshold > 1) or (self.threshold < 0):
                raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
        elif self.method == "stddev":
            if self.threshold < 0:
                raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold ")
        if self.method not in self._ALLOWED_METHODS:
            raise ValueError(f"Method not recognised. Method must be in {self._ALLOWED_METHODS}")

        mixture_kwargs = {
            "n_components": self.n_components,
            "covariance_type": self.covariance_type,
            "tol": self.tol,
            "reg_covar": self.reg_covar,
            "max_iter": self.max_iter,
            "n_init": self.n_init,
            "init_params": self.init_params,
            "weight_concentration_prior_type": self.weight_concentration_prior_type,
            "weight_concentration_prior": self.weight_concentration_prior,
            "mean_precision_prior": self.mean_precision_prior,
            "mean_prior": self.mean_prior,
            "degrees_of_freedom_prior": self.degrees_of_freedom_prior,
            "covariance_prior": self.covariance_prior,
            "random_state": self.random_state,
            "warm_start": self.warm_start,
            "verbose": self.verbose,
            "verbose_interval": self.verbose_interval,
        }
        self.gmm_ = BayesianGaussianMixture(**mixture_kwargs)
        self.gmm_.fit(X)
        train_scores = self.gmm_.score_samples(X)

        if self.method == "quantile":
            # Training samples scoring at or below the (1 - threshold) quantile end up labelled as outliers.
            self.likelihood_threshold_ = np.quantile(train_scores, 1 - self.threshold)
        elif self.method == "stddev":
            # Locate where the kernel density estimate of the training scores peaks (as found by the optimiser).
            kde = gaussian_kde(train_scores)
            peak_location = minimize_scalar(lambda x: -kde(x)).x
            overall_mean = train_scores.mean()
            # Only the scores below that peak contribute to the spread estimate.
            below_peak = train_scores[train_scores < peak_location]
            # NOTE(review): np.std is shift-invariant, so subtracting the mean does not alter the result;
            # confirm whether a deviation around `overall_mean` was intended instead.
            spread = np.std(below_peak - overall_mean)
            self.likelihood_threshold_ = overall_mean - (self.threshold * spread)

        return self

    def score_samples(self, X):
        """Compute the log likelihood for each sample and return the negative value."""
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        return -self.gmm_.score_samples(X)

    def decision_function(self, X):
        """Return `score_samples(X)` shifted by `likelihood_threshold_`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to score.

        Returns
        -------
        array-like of shape (n_samples,)
            The shifted scores. Values greater than or equal to 0 are the ones `predict` labels as outliers.
        """
        # `score_samples` is the negated log likelihood, so adding the threshold puts the decision boundary at 0.
        return self.score_samples(X) + self.likelihood_threshold_

    def predict(self, X):
        """Predict if a point is an outlier or not using the fitted estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples,)
            The predicted data. 1 for inliers, -1 for outliers.
        """
        outlier_flags = (self.decision_function(X) >= 0).astype(int)
        # Map flag 1 (outlier) to -1 and flag 0 (inlier) to 1.
        return 1 - 2 * outlier_flags

    @property
    def allowed_methods(self):
        """Deprecated alias of `_ALLOWED_METHODS`; emits a `DeprecationWarning` on every access."""
        warn(
            "Please use `_ALLOWED_METHODS` instead of `allowed_methods`, "
            "`allowed_methods` will be deprecated in future versions",
            DeprecationWarning,
            # Attribute the warning to the code that accessed the property.
            stacklevel=2,
        )
        return self._ALLOWED_METHODS

fit(X, y=None)

Fit the BayesianGMMOutlierDetector model using X, y as training data.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features )

The training data.

required
y array-like of shape (n_samples,)

Ignored, present for compatibility.

None

Returns:

Name Type Description
self BayesianGMMOutlierDetector

The fitted estimator.

Raises:

Type Description
ValueError
  • If method="quantile" and threshold is not between (0, 1).
  • If method="stddev" and threshold is negative.
  • If method is not in ["quantile", "stddev"].
Source code in sklego/mixture/bayesian_gmm_detector.py
def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
    """Fit the mixture on `X` and derive `likelihood_threshold_` from the scores of the training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training data.
    y : array-like of shape (n_samples,)
        Ignored, present for compatibility.

    Returns
    -------
    self : BayesianGMMOutlierDetector
        The fitted estimator.

    Raises
    ------
    ValueError
        - If `method="quantile"` and `threshold` is smaller than 0 or larger than 1.
        - If `method="stddev"` and `threshold` is negative.
        - If `method` is not in `["quantile", "stddev"]`.
    """

    # GMM sometimes throws an error if you don't do this
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Threshold checks come first; the method-name check below stays independent of them.
    if self.method == "quantile":
        if (self.threshold > 1) or (self.threshold < 0):
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
    elif self.method == "stddev":
        if self.threshold < 0:
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold ")
    if self.method not in self._ALLOWED_METHODS:
        raise ValueError(f"Method not recognised. Method must be in {self._ALLOWED_METHODS}")

    mixture_kwargs = {
        "n_components": self.n_components,
        "covariance_type": self.covariance_type,
        "tol": self.tol,
        "reg_covar": self.reg_covar,
        "max_iter": self.max_iter,
        "n_init": self.n_init,
        "init_params": self.init_params,
        "weight_concentration_prior_type": self.weight_concentration_prior_type,
        "weight_concentration_prior": self.weight_concentration_prior,
        "mean_precision_prior": self.mean_precision_prior,
        "mean_prior": self.mean_prior,
        "degrees_of_freedom_prior": self.degrees_of_freedom_prior,
        "covariance_prior": self.covariance_prior,
        "random_state": self.random_state,
        "warm_start": self.warm_start,
        "verbose": self.verbose,
        "verbose_interval": self.verbose_interval,
    }
    self.gmm_ = BayesianGaussianMixture(**mixture_kwargs)
    self.gmm_.fit(X)
    train_scores = self.gmm_.score_samples(X)

    if self.method == "quantile":
        # The (1 - threshold) quantile of the training scores becomes the cut-off.
        self.likelihood_threshold_ = np.quantile(train_scores, 1 - self.threshold)
    elif self.method == "stddev":
        # Locate where the kernel density estimate of the training scores peaks (as found by the optimiser).
        kde = gaussian_kde(train_scores)
        peak_location = minimize_scalar(lambda x: -kde(x)).x
        overall_mean = train_scores.mean()
        # Only the scores below that peak contribute to the spread estimate.
        below_peak = train_scores[train_scores < peak_location]
        spread = np.std(below_peak - overall_mean)
        self.likelihood_threshold_ = overall_mean - (self.threshold * spread)

    return self

predict(X)

Predict if a point is an outlier or not using the fitted estimator.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples,)

The predicted data. 1 for inliers, -1 for outliers.

Source code in sklego/mixture/bayesian_gmm_detector.py
def predict(self, X):
    """Label every sample in `X` as inlier or outlier based on the sign of `decision_function`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted data. 1 for inliers, -1 for outliers.
    """
    outlier_flags = (self.decision_function(X) >= 0).astype(int)
    # Map flag 1 (outlier) to -1 and flag 0 (inlier) to 1.
    return 1 - 2 * outlier_flags

score_samples(X)

Compute the log likelihood for each sample and return the negative value.

Source code in sklego/mixture/bayesian_gmm_detector.py
def score_samples(self, X):
    """Return the negated per-sample log likelihood of `X` under the fitted mixture."""
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    return -self.gmm_.score_samples(X)

sklego.mixture.gmm_classifier.GMMClassifier

Bases: BaseEstimator, ClassifierMixin

The GMMClassifier trains a Gaussian Mixture Model for each class in y on a dataset X. Once a density is trained for each class we can evaluate the likelihood scores to see which class is more likely.

All parameters of the model are an exact copy of the parameters in scikit-learn.

Note

All the parameters are an exact copy of those of sklearn.mixture.GaussianMixture.

Attributes:

Name Type Description
gmms_ dict[int, GaussianMixture]

A dictionary of Gaussian Mixture Models, one for each class.

classes_ np.ndarray of shape (n_classes,)

The classes seen during fit.

Source code in sklego/mixture/gmm_classifier.py
class GMMClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that fits one Gaussian Mixture Model per class label.

    During `fit` a separate density is estimated on the rows of `X` that belong to each class found in `y`.
    At prediction time every density scores the samples and the class with the highest score is selected.

    All parameters of the model are an exact copy of the parameters in scikit-learn.

    !!! note
        All the parameters are an exact copy of those of
        [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html).

    Attributes
    ----------
    gmms_ : dict[int, GaussianMixture]
        A dictionary of Gaussian Mixture Models, one for each class.
    classes_ : np.ndarray of shape (n_classes,)
        The classes seen during `fit`.
    """

    def __init__(
        self,
        n_components=1,
        covariance_type="full",
        tol=1e-3,
        reg_covar=1e-6,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weights_init=None,
        means_init=None,
        precisions_init=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        # Hyper-parameters are stored untouched; they are only consumed inside `fit`.
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weights_init = weights_init
        self.means_init = means_init
        self.precisions_init = precisions_init
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
        """Train one `GaussianMixture` per class found in `y`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training data.
        y : array-like of shape (n_samples,)
            The target values.

        Returns
        -------
        self : GMMClassifier
            The fitted estimator.
        """
        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        self.gmms_ = {}
        self.classes_ = unique_labels(y)

        # Every class receives its own, identically configured mixture.
        mixture_kwargs = {
            "n_components": self.n_components,
            "covariance_type": self.covariance_type,
            "tol": self.tol,
            "reg_covar": self.reg_covar,
            "max_iter": self.max_iter,
            "n_init": self.n_init,
            "init_params": self.init_params,
            "weights_init": self.weights_init,
            "means_init": self.means_init,
            "precisions_init": self.precisions_init,
            "random_state": self.random_state,
            "warm_start": self.warm_start,
            "verbose": self.verbose,
            "verbose_interval": self.verbose_interval,
        }
        for label in self.classes_:
            members = y == label
            density = GaussianMixture(**mixture_kwargs)
            self.gmms_[label] = density.fit(X[members], y[members])
        return self

    def predict(self, X):
        """Return, for each row of `X`, the class whose mixture yields the highest probability.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples,)
            The predicted data.
        """
        check_is_fitted(self, ["gmms_", "classes_"])
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        best_column = self.predict_proba(X).argmax(axis=1)
        return self.classes_[best_column]

    def predict_proba(self, X):
        """Score `X` under every per-class mixture and normalise the scores row-wise with a softmax.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples, n_classes)
            The predicted probabilities.
        """
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmms_", "classes_"])
        n_samples, n_classes = X.shape[0], self.classes_.shape[0]
        class_scores = np.zeros((n_samples, n_classes))
        # Columns follow the order of `classes_`, which `predict` relies on.
        for column, label in enumerate(self.classes_):
            class_scores[:, column] = self.gmms_[label].score_samples(X)
        return softmax(class_scores, axis=1)

fit(X, y)

Fit the GMMClassifier model using X, y as training data.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features )

The training data.

required
y array-like of shape (n_samples,)

The target values.

required

Returns:

Name Type Description
self GMMClassifier

The fitted estimator.

Source code in sklego/mixture/gmm_classifier.py
def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
    """Train one `GaussianMixture` per class found in `y`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training data.
    y : array-like of shape (n_samples,)
        The target values.

    Returns
    -------
    self : GMMClassifier
        The fitted estimator.
    """
    X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    self.gmms_ = {}
    self.classes_ = unique_labels(y)

    # Every class receives its own, identically configured mixture.
    mixture_kwargs = {
        "n_components": self.n_components,
        "covariance_type": self.covariance_type,
        "tol": self.tol,
        "reg_covar": self.reg_covar,
        "max_iter": self.max_iter,
        "n_init": self.n_init,
        "init_params": self.init_params,
        "weights_init": self.weights_init,
        "means_init": self.means_init,
        "precisions_init": self.precisions_init,
        "random_state": self.random_state,
        "warm_start": self.warm_start,
        "verbose": self.verbose,
        "verbose_interval": self.verbose_interval,
    }
    for label in self.classes_:
        members = y == label
        density = GaussianMixture(**mixture_kwargs)
        self.gmms_[label] = density.fit(X[members], y[members])
    return self

predict(X)

Predict labels for X using fitted estimator.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples,)

The predicted data.

Source code in sklego/mixture/gmm_classifier.py
def predict(self, X):
    """Return, for each row of `X`, the class whose mixture yields the highest probability.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted data.
    """
    check_is_fitted(self, ["gmms_", "classes_"])
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    best_column = self.predict_proba(X).argmax(axis=1)
    return self.classes_[best_column]

predict_proba(X)

Predict probabilities for X using fitted estimator.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples, n_classes)

The predicted probabilities.

Source code in sklego/mixture/gmm_classifier.py
def predict_proba(self, X):
    """Score `X` under every per-class mixture and normalise the scores row-wise with a softmax.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples, n_classes)
        The predicted probabilities.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmms_", "classes_"])
    n_samples, n_classes = X.shape[0], self.classes_.shape[0]
    class_scores = np.zeros((n_samples, n_classes))
    # Columns follow the order of `classes_`, which `predict` relies on.
    for column, label in enumerate(self.classes_):
        class_scores[:, column] = self.gmms_[label].score_samples(X)
    return softmax(class_scores, axis=1)

sklego.mixture.gmm_outlier_detector.GMMOutlierDetector

Bases: OutlierMixin, BaseEstimator

The GMMOutlierDetector trains a Gaussian Mixture model on a dataset X. Once a density is trained we can evaluate the likelihood scores to see if it is deemed likely.

By providing a threshold this model might then label outliers if their likelihood score is too low.

Note

The parameters other than threshold and method are an exact copy of the parameters in sklearn.mixture.GaussianMixture.

Parameters:

Name Type Description Default
threshold float

The limit at which the model thinks an outlier appears, must be between (0, 1).

0.99
method Literal[quantile, stddev]

The method to use to apply the threshold.

Info

If you select method="quantile" then the threshold value represents the quantile value to start calling something an outlier.

If you select method="stddev" then the threshold value represents the number of standard deviations before calling something an outlier.

"quantile"

Attributes:

Name Type Description
gmm_ GaussianMixture

The trained Gaussian Mixture model.

likelihood_threshold_ float

The threshold value used to determine if something is an outlier.

Source code in sklego/mixture/gmm_outlier_detector.py
class GMMOutlierDetector(OutlierMixin, BaseEstimator):
    """The `GMMOutlierDetector` trains a Gaussian Mixture model on a dataset `X`. Once a density is trained we can
    evaluate the likelihood scores to see if it is deemed likely.

    By providing a `threshold` this model might then label outliers if their likelihood score is too low.

    !!! note
        The parameters other than `threshold` and `method` are an exact copy of the parameters in
        [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html).

    Parameters
    ----------
    threshold : float, default=0.99
        The limit at which the model thinks an outlier appears, must be between (0, 1).
    method : Literal["quantile", "stddev"], default="quantile"
        The method to use to apply the `threshold`.

        !!! info
            If you select `method="quantile"` then the threshold value represents the quantile value to start calling
            something an outlier.

            If you select `method="stddev"` then the threshold value represents the
            number of standard deviations before calling something an outlier.

    Attributes
    ----------
    gmm_ : GaussianMixture
        The trained Gaussian Mixture model.
    likelihood_threshold_ : float
        The threshold value used to determine if something is an outlier.
    """

    _ALLOWED_METHODS = ("quantile", "stddev")

    def __init__(
        self,
        threshold=0.99,
        method="quantile",
        n_components=1,
        covariance_type="full",
        tol=1e-3,
        reg_covar=1e-6,
        max_iter=100,
        n_init=1,
        init_params="kmeans",
        weights_init=None,
        means_init=None,
        precisions_init=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10,
    ):
        # Parameters are stored untouched; validation happens in `fit`.
        self.threshold = threshold
        self.method = method
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.reg_covar = reg_covar
        self.max_iter = max_iter
        self.n_init = n_init
        self.init_params = init_params
        self.weights_init = weights_init
        self.means_init = means_init
        self.precisions_init = precisions_init
        self.random_state = random_state
        self.warm_start = warm_start
        self.verbose = verbose
        self.verbose_interval = verbose_interval

    def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":
        """Fit the `GMMOutlierDetector` model using `X`, `y` as training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features )
            The training data.
        y : array-like of shape (n_samples,)
            Ignored, present for compatibility.

        Returns
        -------
        self : GMMOutlierDetector
            The fitted estimator.

        Raises
        ------
        ValueError
            - If `method="quantile"` and `threshold` is smaller than 0 or larger than 1.
            - If `method="stddev"` and `threshold` is negative.
            - If `method` is not in `["quantile", "stddev"]`.
        """
        # GMM sometimes throws an error if you don't do this
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)

        if (self.method == "quantile") and ((self.threshold > 1) or (self.threshold < 0)):
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
        if (self.method == "stddev") and (self.threshold < 0):
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold ")
        if self.method not in self._ALLOWED_METHODS:
            raise ValueError(f"Method not recognised. Method must be in {self._ALLOWED_METHODS}")

        self.gmm_ = GaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weights_init=self.weights_init,
            means_init=self.means_init,
            precisions_init=self.precisions_init,
            random_state=self.random_state,
            warm_start=self.warm_start,
            verbose=self.verbose,
            verbose_interval=self.verbose_interval,
        )
        self.gmm_.fit(X)
        score_samples = self.gmm_.score_samples(X)

        if self.method == "quantile":
            # The lowest `1 - threshold` fraction of the training scores falls below this cut-off.
            self.likelihood_threshold_ = np.quantile(score_samples, 1 - self.threshold)

        if self.method == "stddev":
            # Locate the peak of a kernel density estimate of the training scores, then measure the spread of
            # the scores lying below that peak; the cut-off sits `threshold` such spreads below the mean score.
            density = gaussian_kde(score_samples)
            max_x_value = minimize_scalar(lambda x: -density(x)).x
            mean_likelihood = score_samples.mean()
            new_likelihoods = score_samples[score_samples < max_x_value]
            new_likelihoods_std = np.std(new_likelihoods - mean_likelihood)
            self.likelihood_threshold_ = mean_likelihood - (self.threshold * new_likelihoods_std)

        return self

    def score_samples(self, X):
        """Compute the log likelihood for each sample and return the negative value.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to score.

        Returns
        -------
        array-like of shape (n_samples,)
            The negated per-sample scores of the fitted mixture model.
        """
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
        if len(X.shape) == 1:
            X = np.expand_dims(X, 1)

        return -self.gmm_.score_samples(X)

    def decision_function(self, X):
        """Shift the negated scores so that 0 coincides with the fitted likelihood threshold.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to evaluate.

        Returns
        -------
        array-like of shape (n_samples,)
            `score_samples(X) + likelihood_threshold_`. A value greater than or equal to 0 means the sample's
            log likelihood is at or below the threshold, which `predict` labels as an outlier.
        """
        # `score_samples` is already negated, so adding the threshold yields `threshold - log_likelihood`.
        return self.score_samples(X) + self.likelihood_threshold_

    def predict(self, X):
        """Predict if a point is an outlier or not using the fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to predict.

        Returns
        -------
        array-like of shape (n_samples,)
            The predicted data. 1 for inliers, -1 for outliers.
        """
        is_outlier = self.decision_function(X) >= 0
        # Map the boolean mask straight to labels instead of relabelling an intermediate 0/1 array in place.
        return np.where(is_outlier, -1, 1).astype(int)

    @property
    def allowed_methods(self):
        """Deprecated alias of `_ALLOWED_METHODS`; emits a `DeprecationWarning` on access."""
        warn(
            "Please use `_ALLOWED_METHODS` instead of `allowed_methods`, "
            "`allowed_methods` will be deprecated in future versions",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the code accessing the property
        )
        return self._ALLOWED_METHODS

fit(X, y=None)

Fit the GMMOutlierDetector model using X, y as training data.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features )

The training data.

required
y array-like of shape (n_samples,)

Ignored, present for compatibility.

None

Returns:

Name Type Description
self GMMOutlierDetector

The fitted estimator.

Raises:

Type Description
ValueError
  • If method="quantile" and threshold is not between (0, 1).
  • If method="stddev" and threshold is negative.
  • If method is not in ["quantile", "stddev"].
Source code in sklego/mixture/gmm_outlier_detector.py
def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":
    """Train the `GMMOutlierDetector` on `X` and derive the likelihood threshold from the training scores.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features )
        The training data.
    y : array-like of shape (n_samples,)
        Ignored, present for compatibility.

    Returns
    -------
    self : GMMOutlierDetector
        The fitted estimator.

    Raises
    ------
    ValueError
        - If `method` is not in `["quantile", "stddev"]`.
        - If `method="quantile"` and `threshold` is smaller than 0 or larger than 1.
        - If `method="stddev"` and `threshold` is negative.
    """
    # GMM sometimes throws an error if the input is not validated and made two-dimensional first
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    if X.ndim == 1:
        X = np.expand_dims(X, 1)

    # An unknown method can never trip the threshold checks below, so it is safe to reject it up front.
    if self.method not in self._ALLOWED_METHODS:
        raise ValueError(f"Method not recognised. Method must be in {self._ALLOWED_METHODS}")
    if self.method == "quantile":
        if self.threshold > 1 or self.threshold < 0:
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
    if self.method == "stddev":
        if self.threshold < 0:
            raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold ")

    # Every mixture hyperparameter is forwarded under the same name it has on this estimator.
    forwarded = (
        "n_components",
        "covariance_type",
        "tol",
        "reg_covar",
        "max_iter",
        "n_init",
        "init_params",
        "weights_init",
        "means_init",
        "precisions_init",
        "random_state",
        "warm_start",
        "verbose",
        "verbose_interval",
    )
    self.gmm_ = GaussianMixture(**{name: getattr(self, name) for name in forwarded})
    self.gmm_.fit(X)
    train_scores = self.gmm_.score_samples(X)

    if self.method == "quantile":
        self.likelihood_threshold_ = np.quantile(train_scores, 1 - self.threshold)
    elif self.method == "stddev":
        # Find where the kernel density estimate of the scores peaks, then measure the spread of the scores
        # below that peak; the threshold sits `threshold` such spreads under the mean score.
        density = gaussian_kde(train_scores)

        def negative_density(x):
            return -density(x)

        peak_location = minimize_scalar(negative_density).x
        score_mean = train_scores.mean()
        below_peak = train_scores[train_scores < peak_location]
        spread = np.std(below_peak - score_mean)
        self.likelihood_threshold_ = score_mean - (self.threshold * spread)

    return self

predict(X)

Predict if a point is an outlier or not using the fitted model.

Parameters:

Name Type Description Default
X array-like of shape (n_samples, n_features)

The data to predict.

required

Returns:

Type Description
array-like of shape (n_samples,)

The predicted data. 1 for inliers, -1 for outliers.

Source code in sklego/mixture/gmm_outlier_detector.py
def predict(self, X):
    """Label every sample in `X` as inlier or outlier with the fitted model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted data. 1 for inliers, -1 for outliers.
    """
    # A non-negative decision value marks a sample as an outlier.
    flagged = self.decision_function(X) >= 0
    labels = np.ones(flagged.shape, dtype=int)
    labels[flagged] = -1
    return labels

score_samples(X)

Compute the log likelihood for each sample and return the negative value.

Source code in sklego/mixture/gmm_outlier_detector.py
def score_samples(self, X):
    """Return the negated per-sample log likelihood under the fitted mixture model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to score.

    Returns
    -------
    array-like of shape (n_samples,)
        The scores of `gmm_.score_samples(X)` with their sign flipped.
    """
    X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
    check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
    if X.ndim == 1:
        X = np.expand_dims(X, 1)

    log_likelihood = self.gmm_.score_samples(X)
    return -log_likelihood