Source code for fsfc.generic.SPEC

from abc import abstractmethod

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel, cosine_similarity

from fsfc.base import KBestFeatureSelector


[docs]class SPEC(KBestFeatureSelector):
    """
    SPEC-family of feature selection algorithms.

    Every algorithm of this family creates graph representation of distribution of samples in a high-dimensional space.
    Samples becomes vertices and RBF-distance between them becomes weight of the edge. Algorithms use Spectral Graph
    Theory to calculate features scores.

    Based on the article `"Spectral feature selection for supervised and unsupervised learning." <http://www.public.asu.edu/~huanliu/papers/icml07.pdf>`_.
    """

    @abstractmethod
    def _calc_spec_scores(self, degree, laplacian, normalised_features, normaliser):
        """
        Calculate SPEC scores for the method

        Parameters
        ----------
        degree: ndarray
            Degree matrix of the graph
        laplacian: ndarray
            Laplacian of the graph
        normalised_features: ndarray
            Feature vectors, normalised by normaliser
        normaliser: ndarray
            Vector computed by the degree matrix and used for normalisation of feature vectors and laplacian

        Returns
        -------
        scores: ndarray
            Scores of features
        """
        pass

    def _calc_scores(self, x):
        similarity = rbf_kernel(x)
        adjacency = similarity
        degree_vector = np.sum(adjacency, 1)
        degree = np.diag(degree_vector)
        laplacian = degree - adjacency
        normaliser_vector = np.reciprocal(np.sqrt(degree_vector))
        normaliser = np.diag(normaliser_vector)

        normalised_laplacian = normaliser.dot(laplacian).dot(normaliser)

        weighted_features = np.matmul(normaliser, x)

        normalised_features = weighted_features / np.linalg.norm(weighted_features, axis=0)
        return self._calc_spec_scores(degree, normalised_laplacian, normalised_features, normaliser)


[docs]class NormalizedCut(SPEC):
    """
    Feature selection algorithm that represents samples as vertices of graph.
    Weights of edges of this graph are equal to RBF-distances between points.
    Algorithm attempts to find minimal cut in this graph.

    Algorithm selects k features which were `separated` by this cut as they are considered to be better for explanation
    of the dataset.

    Parameters
    ----------
    k: int
        Number of features to select
    """

    def _calc_spec_scores(self, degree, laplacian, normalised_features, normaliser):
        all_to_all = normalised_features.transpose().dot(laplacian).dot(normalised_features)
        return np.diag(all_to_all)


[docs]class GenericSPEC(NormalizedCut):
    """
    Feature selection algorithm that represents samples as vertices of graph.
    Weights of edges of this graph are equal to RBF-distances between points.

    Algorithm uses Spectral Graph Theory to find features with the best separability.

    To do this, algorithm finds the trivial eugenvector of the Laplacian of the graph and uses it to
    normalise scores computed using :class:`NormalizedCut`. Such normalisation helps to improve
    accuracy of feature selection according to the article SPEC family is based on.

    Algorithm selects top k features according to this scores

    Parameters
    ----------
    k: int
        Number of features to select
    """

    def _calc_spec_scores(self, degree, laplacian, normalised_features, normaliser):
        normalised_cut = super()._calc_spec_scores(
            degree, laplacian, normalised_features, normaliser
        )
        trivial_eugenvector = normaliser.dot(np.ones([normalised_features.shape[0], 1]))
        norm = 1 - normalised_features.transpose().dot(trivial_eugenvector).squeeze().transpose()

        return normalised_cut / norm


[docs]class FixedSPEC(SPEC):
    """
    Feature selection algorithm that represents samples as vertices of graph.
    Weights of edges of this graph are equal to RBF-distances between points.

    Algorithm uses Spectral Graph Theory to find features with the best separability in assumption that
    points are separated to the predefined number of clusters

    To do this, algorithm finds eigenvectors corresponding to K smallest eigenvalues of the Laplacian of the graph,
    except the trivial one, and uses cosine distance between them and feature vectors to detect the most explaining
    features. Algorithm selects k features using this score.

    Parameters
    ----------
    k: int
        Number of features to select
    clusters: int
        Expected number of clusters
    """
    def __init__(self, k, clusters):
        super().__init__(k)
        self.clusters = clusters

    def _calc_spec_scores(self, degree, laplacian, normalised_features, normaliser):
        values, vectors = np.linalg.eigh(laplacian)

        result = np.zeros([normalised_features.shape[1]])
        for k in range(1, self.clusters):
            eigenvalue = values[k]
            eigenvector = vectors[:, k]
            cosines = np.zeros_like(result)
            for i in range(normalised_features.shape[1]):
                feature = normalised_features[:, i]
                similarity = cosine_similarity([eigenvector], [feature])[0]
                cosines[i] = similarity * similarity
            result += (2 - eigenvalue) * cosines
        return -result