Source code for fsfc.text.FTC

from math import log2, inf

import numpy as np

from fsfc.base import ClusteringFeatureSelector
from fsfc.utils import apriori


[docs]class FTC(ClusteringFeatureSelector): """ Frequent Terms-based Clustering algorithm. Uses frequent termsets to find clusters and simultaneously select features which determine every cluster. Based on the article `"Frequent term-based text clustering" <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.7997&rep=rep1&type=pdf>`_. **FTS** is a set of terms that appear in some part of all samples in dataset. We will say that FTS *covers* sample if every term of FTS is contained in the sample. Algorithm does clustering in the following way: 1. Find all FTS for dataset with specified *minsup*. Elements of FTS are terms, i.e. features of dataset. 2. Find FTS that has the lowest Entropy Overlap with the rest clusters with respect to dataset. It's shown in the paper that such FTS will explain data the best. 3. Add this FTS to clustering, remove from dataset samples covered by it, repeat steps 2 and 3. 4. Assign clusters to samples. Sample belongs to a cluster defined by FTS if FTS covers sample. 5. Scores of features are 1 if feature belongs to any FTS and 0 otherwise. Parameters ---------- minsup: float Part of the dataset which should be covered by each FTS. """ def __init__(self, minsup): super().__init__(-1) self.minsup = minsup
[docs] def fit(self, x, *rest): """ Fit algorithm to dataset, find clusters and select relevant features. Parameters ---------- x: csr_matrix SciPy Sparse Matrix representing terms contained in every sample. May be created by vectorizers from sklearn. Returns ------- self: FTC Returns itself to support chaining. """ return super().fit(x, *rest)
def _calc_scores_and_labels(self, x): n_samples, n_features = x.get_shape() # Construct dataset for frequent itemsets extractions rows, columns = x.nonzero() dataset = [[] for _ in range(n_samples)] for (i, j) in zip(rows, columns): if x[i, j] != 0: # noinspection PyTypeChecker dataset[i].append(int(j)) remaining = apriori(dataset, self.minsup) if len(FTC._calculate_coverage(remaining, dataset)) != n_samples: # Dataset can't be fully clustered using this method raise ValueError( 'Can\'t cluster a dataset using FTC method - it isn\'t covered by the chosen itemset ' + str(remaining) ) dataset_copy = [sample for sample in dataset] result = [] # Until all samples are covered while len(dataset) != 0: min_overlap = None best_cluster = None # Find feature set with the smallest overlap with rest for itemset in remaining: overlap = FTC._calculate_overlap(itemset, remaining, dataset) if min_overlap is None or overlap < min_overlap: min_overlap = overlap best_cluster = itemset # Add it to result result.append(best_cluster) remaining.remove(best_cluster) # Remove all covered samples from dataset for sample in FTC._calculate_coverage([best_cluster], dataset): dataset.remove(sample) # Select features used in final clustering, assign labels according to chosen itemsets used_features = {f for itemset in result for f in itemset} scores = [1 if i in used_features else 0 for i in range(n_features)] labels = [None for _ in range(n_samples)] for i in range(n_samples): features = dataset_copy[i] for cluster_idx in range(len(result)): if all(c in features for c in result[cluster_idx]): labels[i] = cluster_idx break self.k = len(used_features) return np.array(scores), np.array(labels) @staticmethod def _calculate_coverage(clusters, dataset): """ Finds samples of dataset covered by clusters Parameters ---------- clusters: list List of FTS defining clusters dataset: list List of sets defining samples Returns ------- covered: list List of samples covered by clusters """ # Find samples covered by specified clusters covered = [] for sample in dataset: for cluster in clusters: if all(c in sample for c in cluster): covered.append(sample) break return covered @staticmethod def _calculate_overlap(cluster, rest, dataset): """ Calculates Entropy Overlap of cluster with rest FTS Parameters ---------- cluster: set Termset defining cluster rest: list List of other termsets dataset: list List of sets defining samples Returns ------- overlap: float Value of overlap of cluster with the rest of clusters """ f = [0 for _ in dataset] # Calculate how much clusters from `rest` cover each sample for i in range(len(dataset)): sample = dataset[i] for cur_cluster in rest: if all(c in sample for c in cur_cluster): f[i] += 1 overlap = inf # Compute entropy-overlap for cluster for i in range(len(dataset)): sample = dataset[i] if all(c in sample for c in cluster): if overlap == inf: overlap = 0 overlap += -1.0 / f[i] * log2(1.0 / f[i]) return overlap