import numpy as np
import math
from sklearn.neighbors import kneighbors_graph
from sklearn.linear_model import LassoLars, Lars
from scipy.linalg import eigh
from fsfc.base import KBestFeatureSelector
class MCFS(KBestFeatureSelector):
"""
Multi-Class Feature selection algorithm.
Uses k-NN graph of samples in dataset and Spectral Graph Theory to find the most explaining features.
Based on the article `"Unsupervised feature selection for multi-cluster data." <https://dl.acm.org/citation.cfm?id=1835848>`_.
Algorithm selects features in the following way:
1. Computes k-NN graph for the dataset.
2. Computes heat matrix for this graph, degree and laplacian matrix.
3. Solves eigen-problem: L y = `lambda` D y, selects K smallest eigen-values and corresponding vectors.
4. Solves K regression problems, trying to predict every eigenvector by regression using dataset.
5. Computes score of each feature using found regression coefficients.
6. Select k features with the top scores.
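
    For neighbouring samples ``i`` and ``j`` the heat matrix uses the weight
    ``W_ij = exp(-||x_i - x_j||^2 / sigma)``, and the score of a feature is the
    largest coefficient it receives across the K regression problems.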

    Parameters
    ----------
    k: int
        Number of features to select.
    clusters: int
        Expected number of clusters in the dataset.
    p: int (default 8)
        Number of nearest neighbours used to construct the k-NN graph.
    sigma: int (default 1)
        Coefficient used in the computation of the heat matrix.
    mode: 'default' or 'lasso' (default 'default')
        Type of penalty for the regression step: with 'default' the algorithm uses no penalty (plain LARS),
        with 'lasso' it uses an L1-penalty (LassoLars).
    alpha: float (default 0.01)
        Strength of the penalty when **mode='lasso'**.
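
    Examples
    --------
    A minimal usage sketch (assumes the scikit-learn style ``fit``/``transform``
    interface provided by :class:`KBestFeatureSelector`; the data is hypothetical):

    >>> import numpy as np
    >>> x = np.random.rand(50, 10)
    >>> selector = MCFS(k=3, clusters=2)
    >>> _ = selector.fit(x)
    >>> x_reduced = selector.transform(x)  # keeps the 3 top-scoring features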
"""
    def __init__(self, k, clusters, p=8, sigma=1, mode='default', alpha=0.01):
        super().__init__(k)
        self.clusters = clusters
        self.p = p
        self.mode = mode
        self.sigma = sigma
        self.alpha = alpha
    def _create_regressor(self):
        # Plain LARS for mode='default', L1-penalised LARS (LassoLars) for mode='lasso'
        if self.mode == 'default':
            return Lars()
        if self.mode == 'lasso':
            return LassoLars(alpha=self.alpha)
        raise ValueError('Unexpected mode ' + self.mode + '. Expected "default" or "lasso"')
    def _calc_scores(self, x):
        # Build the p-nearest-neighbour graph of the samples
        graph = kneighbors_graph(
            x,
            n_neighbors=self.p,
        )
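        # graph is a sparse 0/1 connectivity matrix; its non-zero entries
        # enumerate the neighbour pairs that receive a heat-kernel weight below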
        # Construct the heat matrix: W_ij = exp(-||x_i - x_j||^2 / sigma) for neighbours
        w = np.zeros([x.shape[0], x.shape[0]])
        rows, cols = graph.nonzero()
        for i, j in zip(rows, cols):
            w[i, j] = math.exp(-np.linalg.norm(x[i] - x[j])**2 / self.sigma)
        # Compute the degree and Laplacian matrices
        degree_vector = np.sum(w, 1)
        degree = np.diag(degree_vector)
        laplacian = degree - w
        # Solve the generalized eigen-problem L y = lambda * D y
        values, vectors = eigh(laplacian, degree)
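        # eigh returns eigenvalues in ascending order, so the first `clusters`
        # columns of `vectors` are the eigenvectors with the smallest eigenvalues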
        smallest = vectors[:, 0:self.clusters].T
        # Regress each eigenvector on the original features; coef_ holds one weight per feature
        coefs = []
        for i in range(self.clusters):
            this_coefs = self._create_regressor().fit(x, smallest[i]).coef_
            coefs.append(this_coefs)
        coefs = np.array(coefs)
        # MCFS score of a feature: its largest coefficient across the K regression problems
        scores = np.max(coefs, 0)
        return scores
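

if __name__ == '__main__':
    # Illustrative sketch, not part of the original fsfc module. It calls the
    # private `_calc_scores` defined above on hypothetical random data to show how
    # the per-feature MCFS scores are produced and ranked; real usage would go
    # through the selector's public interface instead.
    rng = np.random.RandomState(0)
    data = rng.rand(60, 10)
    selector = MCFS(k=3, clusters=2)
    feature_scores = selector._calc_scores(data)
    top_features = np.argsort(feature_scores)[::-1][:3]  # indices of the 3 best-scoring features
    print('MCFS scores per feature:', feature_scores)
    print('Selected features:', top_features)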