# Source code for fsfc.generic.LFSBSS

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import euclidean_distances

from fsfc.base import BaseEstimator, ClusterMixin


class LFSBSS(BaseEstimator, ClusterMixin):
    """
    Localised Feature Selection, Based on Scattered Separability.

    Selects features and simultaneously builds clustering in an iterative way.
    Every cluster has its local set of selected features, and we project input data
    to a subspace defined by it to predict cluster of a point.

    This implementation doesn't take into account importance of overlay of clusters and unassigned points
    for the sake of performance.

    Based on the article `"Localized feature selection for clustering." <http://www.cs.wayne.edu/~jinghua/publication/PRL-LocalizedFeatureSelection.pdf>`_.

    Algorithm builds clustering in the following way:
        1. Find initial clustering using k-Means algorithm
        2. For each cluster:
            1. Find feature which can be dropped to improve the Scatter Separability Score
            2. Recompute clusters without this feature
            3. Find cluster that is the most similar to the current one
            4. Compute normalized value of Scatter Separability Score for two clusters - current and new
            5. If scores were improved, drop the feature and update clustering
        3. Repeat step 2 until no changes were made
        4. Return found clusters

    Algorithm predicts clusters for new points in the following way:
        1. Project points to the feature subspace of each cluster
        2. Find the cluster whose center is the closest to projected point

    Parameters
    ----------
    clusters: int
        Number of clusters to find
    max_iterations: int (default 100)
        Maximal number of iterations of the algorithm

    Attributes
    ----------
    labels_: list of int
        Cluster index of every sample of the fitted dataset (``None`` before ``fit``)
    features_: list of set
        For every cluster, indices of the features kept for it (``None`` before ``fit``)
    means_: ndarray
        One row per cluster: cluster centre in the full feature space (``None`` before ``fit``)
    vars_: list of float
        For every cluster, variance of its members' values over its kept features (``None`` before ``fit``)
    """

    def __init__(self, clusters, max_iterations=100):
        self.clusters = clusters
        self.max_iterations = max_iterations
        self.labels_ = None
        self.features_ = None
        self.means_ = None
        self.vars_ = None

    def fit(self, x):
        """
        Fit algorithm to dataset, find clusters and set of features for every cluster

        Parameters
        ----------
        x: ndarray
            The dataset

        Returns
        -------
        self: LFSBSS
            Returns itself to support chaining
        """

        x = np.asarray(x)
        n_samples, n_features = x.shape
        # Build initial clustering; every cluster starts with the full feature set
        k_means = KMeans(n_clusters=self.clusters)
        features = [set(range(n_features)) for _ in range(self.clusters)]
        clusters, means = self._find_clusters_and_means(k_means, x)

        for _ in range(self.max_iterations):
            was_changed = False
            for i in range(self.clusters):
                if self._try_drop_feature(k_means, x, i, clusters, means, features):
                    was_changed = True
            if not was_changed:
                break

        self.features_ = features
        self.means_ = means

        # One variance per cluster, computed over its members restricted to its own features
        self.vars_ = []
        for idx, cluster in enumerate(clusters):
            values = x[np.ix_(cluster, sorted(features[idx]))]
            self.vars_.append(np.var(values) if values.size else 0.0)

        # Clusters may overlap after updates (the later cluster wins) or leave samples uncovered
        labels = [None] * n_samples
        for idx, cluster in enumerate(clusters):
            for sample in cluster:
                labels[sample] = idx
        unassigned = [sample for sample, label in enumerate(labels) if label is None]
        if unassigned:
            # Samples that ended up in no cluster are assigned to the nearest one
            for sample, label in zip(unassigned, self.predict(x[unassigned])):
                labels[sample] = int(label)
        self.labels_ = labels
        return self

    def predict(self, x):
        """
        Predict cluster for one sample or for a batch of samples

        Parameters
        ----------
        x: ndarray
            Either a single sample of shape (n_features,) or samples of shape (n_samples, n_features)

        Returns
        -------
        label: int or ndarray
            Predicted cluster for a single sample, or array of predicted clusters for a batch

        Raises
        ------
        ValueError
            If the estimator was not fitted
        """

        if self.features_ is None or self.means_ is None or self.vars_ is None:
            raise ValueError("LFSBSS is not fitted yet, call fit() before predict()")

        x = np.asarray(x)
        single = x.ndim == 1
        samples = x[np.newaxis, :] if single else x

        # For every cluster: project samples to its subspace, measure distance to its
        # projected mean and normalise by the variance of the cluster
        scores = np.empty((samples.shape[0], self.clusters))
        for i in range(self.clusters):
            subspace = sorted(self.features_[i])
            diff = samples[:, subspace] - np.asarray(self.means_[i])[subspace]
            distance = np.linalg.norm(diff, axis=1)
            variance = self.vars_[i]
            # A degenerate cluster (zero or undefined variance) is compared by raw distance
            scale = variance if variance > 0 else 1.0
            scores[:, i] = distance / scale

        # argmin keeps the first cluster among equally close ones
        closest = scores.argmin(axis=1)
        return int(closest[0]) if single else closest

    def _try_drop_feature(self, k_means, x, i, clusters, means, features):
        """
        Try to drop one feature from cluster ``i``.

        On success ``clusters``, ``means`` and ``features`` are updated in place.
        Returns True if a feature was dropped, False otherwise.
        """
        cluster = clusters[i]
        mean = means[i]
        this_features = sorted(features[i])
        if len(this_features) <= 1 or not cluster:
            # Can't drop anything, or there is nothing to score
            return False

        # Find a feature whose removal gives the highest scatter separability score
        max_score = None
        new_features = None
        for feature in this_features:
            without_feature = [f for f in this_features if f != feature]
            score = self._compute_score(x, means, cluster, mean, without_feature)
            if max_score is None or score > max_score:
                max_score = score
                new_features = without_feature
        if new_features is None:
            return False

        # Repartition dataset using new features
        new_x = x[:, new_features]
        new_clusters, new_means = self._find_clusters_and_means(k_means, new_x)

        # Among the NEW clusters pick the one most similar (Jaccard) to the current one;
        # argmax keeps the first of equally similar clusters
        similarities = [LFSBSS._jaccard_score(candidate, cluster) for candidate in new_clusters]
        best = int(np.argmax(similarities))
        most_similar = new_clusters[best]
        new_mean = new_means[best]
        if not most_similar:
            # Nothing to select
            return False

        # Normalized value of scatter separability: score in the old subspace times score
        # in the new one. The old cluster's score in the new subspace is max_score.
        nv_old = self._compute_score(x, means, cluster, mean, this_features) * max_score
        nv_new = self._compute_score(x, means, most_similar, mean, this_features) * \
            self._compute_score(new_x, new_means, most_similar, new_mean, np.arange(len(new_features)))

        if nv_new >= nv_old:
            # It's better to drop this feature
            features[i] = set(new_features)
            clusters[i] = most_similar
            # k-Means centre lives in the reduced space, so recompute the centre of the
            # accepted members in the full feature space to keep `means` rectangular
            means[i] = x[most_similar].mean(axis=0)
            return True
        return False

    def _find_clusters_and_means(self, k_means, x):
        """
        Run k-Means on ``x``.

        Returns a list with sample indices of every cluster and a copy of cluster centres.
        """
        assignment = k_means.fit_predict(x)
        clusters = [[] for _ in range(self.clusters)]
        for sample, label in enumerate(assignment):
            clusters[label].append(sample)
        return clusters, k_means.cluster_centers_.copy()

    @staticmethod
    def _compute_score(x, means, cluster, mean, features):
        """
        Scatter separability of ``cluster`` in the subspace ``features``, divided by cluster size.

        Parameters
        ----------
        x: ndarray
            Dataset, samples in rows
        means: ndarray
            Centres of all clusters, one per row, in the feature space of ``x``
        cluster: sequence of int
            Indices of samples of the scored cluster
        mean: ndarray
            Centre of the scored cluster in the feature space of ``x``
        features: sequence of int
            Indices of features that define the subspace

        Raises
        ------
        ValueError
            If ``cluster`` is empty
        """
        cluster = np.asarray(cluster, dtype=int)
        features = np.asarray(features, dtype=int)
        if cluster.size == 0:
            raise ValueError("Can't compute scatter separability of an empty cluster")

        projected = x[:, features]
        cluster_values = projected[cluster, :]
        # Within-cluster scatter around the cluster centre
        in_cluster = LFSBSS._tensor_product_sum(cluster_values - mean[features])
        # Between-cluster scatter of all centres around the mean of the dataset
        between_cluster = LFSBSS._tensor_product_sum(means[:, features] - projected.mean(axis=0))

        try:
            separability = np.trace(np.linalg.inv(in_cluster).dot(between_cluster))
        except np.linalg.LinAlgError:
            # Singular within-cluster scatter: fall back to ratio of traces
            separability = np.trace(between_cluster) / np.trace(in_cluster)
        return separability / cluster_values.shape[0]

    @staticmethod
    def _tensor_product_sum(x):
        """Sum of outer products of rows of ``x`` with themselves, i.e. ``x.T @ x``."""
        x = np.asarray(x, dtype=float)
        return x.T.dot(x)

    @staticmethod
    def _jaccard_score(a, b):
        """Jaccard similarity of two collections; two empty collections are treated as identical."""
        a = set(a)
        b = set(b)
        union = a | b
        if not union:
            return 1.0
        return len(a & b) / float(len(union))