Source code for fsfc.generic.Lasso

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from scipy.optimize import fixed_point

from fsfc.base import ClusteringFeatureSelector


[docs]class Lasso(ClusteringFeatureSelector): """ Feature selection and clustering algorithm exploting the idea of L1-norm Simultaneously does clustering and computes "importance" of each feature for it Based on the article `"A framework for feature selection in clustering." <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2930825/>`_ Algorithm does clustering and selects features in the following way: 1. Assigns equal weights to every feature 2. Finds clusters according to weights of features 3. Computes objective of the method 4. Optimizes L1-penalty for found objective 5. Computes new feature weights using objective 6. If new weights equal to the old ones, break. Otherwise repeat steps 2-6 7. Select top k features according to weights Parameters ---------- k: int Number of features to select norm_constraint: float Constraint of L1-norm eps: float (default 1e-4) Precision of the algorithm max_iterations: int (default 100) Maximal number of iterations of algorithm """ def __init__(self, k, norm_constraint, eps=1e-4, max_iterations=100): super().__init__(k) self.norm_constraint = norm_constraint self.max_iterations = max_iterations self.eps = eps def _calc_scores_and_labels(self, x): # Assign equal weights to features feature_weights = np.full([1, x.shape[1]], 1/np.sqrt(x.shape[1])) labels = None for it in range(self.max_iterations): # Find clusters in the feature space, normalised by weights weighted_samples = feature_weights * x labels = KMeans().fit_predict(weighted_samples) # Compute objective vector of the method objective = Lasso._calc_objective_vector(x, labels) # Find parameter of threshold that optimizes L1-norm root = fixed_point( lambda delta: self._function_to_optimize(objective, delta), 0 ) # Compute new feature weights new_weights = Lasso._calc_new_feature_weights(objective, root) if np.linalg.norm(new_weights - feature_weights) < self.eps: break feature_weights = new_weights return feature_weights[0], labels @staticmethod def _calc_objective_vector(x, labels): clusters = {} for i, label in enumerate(labels): if label not in clusters: clusters[label] = [] clusters[label].append(i) result = np.zeros([1, x.shape[1]]) for i in range(x.shape[1]): feature = 0 samples = x[:, i].T.reshape([x.shape[0], 1]) for label, cluster in clusters.items(): size = len(cluster) cluster_samples = samples[cluster] distances = euclidean_distances(cluster_samples) feature += np.sum(distances) / size result[0, i] = np.sum(euclidean_distances(samples)) / x.shape[0] - feature return result def _function_to_optimize(self, objective, delta): weights = Lasso._calc_new_feature_weights(objective, delta) return np.linalg.norm(weights, 1) - self.norm_constraint @staticmethod def _soft_threshold(x, delta): positive = np.clip(x, a_min=0, a_max=None) return np.sign(positive) * np.clip(np.abs(positive) - delta, a_min=0, a_max=None) @staticmethod def _calc_new_feature_weights(objective, delta): soft = Lasso._soft_threshold(objective, delta) normed = soft / np.linalg.norm(soft) return normed