Source code for fsfc.generic.MCFS

import numpy as np
import math
from sklearn.neighbors import kneighbors_graph
from sklearn.linear_model import LassoLars, Lars
from scipy.linalg import eigh
from fsfc.base import KBestFeatureSelector

class MCFS(KBestFeatureSelector):
    """
    Multi-Cluster Feature Selection (MCFS) algorithm.

    Uses a k-NN graph of the samples in the dataset and spectral graph theory
    to find the most explanatory features.

    Based on the article `"Unsupervised feature selection for multi-cluster data." <>`_.

    The algorithm selects features in the following way:

    1. Computes the k-NN graph for the dataset.
    2. Computes the heat matrix for this graph, then the degree and Laplacian matrices.
    3. Solves the eigen-problem ``L y = lambda D y`` and selects the K smallest
       eigenvalues with their corresponding eigenvectors.
    4. Solves K regression problems, trying to predict each eigenvector from the dataset.
    5. Computes the score of each feature from the found regression coefficients.
    6. Selects the k features with the top scores.

    Parameters
    ----------
    k: int
        Number of features to select.
    clusters: int
        Expected number of clusters.
    p: int (default 8)
        Number of nearest neighbours for construction of the k-NN graph.
    sigma: int (default 1)
        Coefficient for computation of the heat matrix.
    mode: 'default' or 'lasso' (default 'default')
        Type of penalty for the method: with 'default' the algorithm uses no penalty,
        with 'lasso' it uses an L1-penalty.
    alpha: float (default 0.01)
        Importance of the penalty when **mode='lasso'**.
    """

    def __init__(self, k, clusters, p=8, sigma=1, mode='default', alpha=0.01):
        super().__init__(k)
        self.clusters = clusters
        self.p = p
        self.mode = mode
        self.sigma = sigma
        self.alpha = alpha

    def _create_regressor(self):
        if self.mode == 'default':
            return Lars()
        if self.mode == 'lasso':
            return LassoLars(alpha=self.alpha)
        raise ValueError('Unexpected mode ' + self.mode + '. Expected "default" or "lasso"')

    def _calc_scores(self, x):
        graph = kneighbors_graph(
            x,
            n_neighbors=self.p,
        )

        # Construct the heat matrix
        w = np.zeros([x.shape[0], x.shape[0]])
        rows, cols = graph.nonzero()
        for i, j in zip(rows, cols):
            w[i, j] = math.exp(-np.linalg.norm(x[i] - x[j]) ** 2 / self.sigma)

        # Compute degree and Laplacian matrices
        degree_vector = np.sum(w, 1)
        degree = np.diag(degree_vector)
        laplacian = degree - w

        # Solve the eigen-problem
        values, vectors = eigh(laplacian, degree)
        smallest = vectors[:, 0:self.clusters].T

        # Find coefficients for each cluster
        coefs = []
        for i in range(self.clusters):
            this_coefs = self._create_regressor().fit(x, smallest[i]).coef_
            coefs.append(this_coefs)
        coefs = np.array(coefs)

        # Compute MCFS-scores
        scores = np.max(coefs, 0)
        return scores
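
A minimal usage sketch follows (not part of the module source above). It assumes the class is importable as ``from fsfc.generic import MCFS`` and that the ``KBestFeatureSelector`` base class provides the usual scikit-learn ``fit``/``transform`` interface; the toy data and variable names are purely illustrative.

    import numpy as np
    from fsfc.generic import MCFS  # assumed import path, adjust if needed

    # Toy data: 20 samples, 5 features (illustrative only)
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    # Select the 2 highest-scoring features, expecting 3 clusters
    selector = MCFS(k=2, clusters=3, p=5)
    selector.fit(X)                     # computes MCFS scores for every feature
    X_reduced = selector.transform(X)   # keeps only the selected columns
    print(X_reduced.shape)              # expected (20, 2) under the assumptions above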