Source code for fsfc.text.FTC

from math import log2, inf

import numpy as np

from fsfc.base import ClusteringFeatureSelector
from fsfc.utils import apriori


[docs]class FTC(ClusteringFeatureSelector):
    """
    Frequent Terms-based Clustering algorithm. Uses frequent termsets to find clusters and simultaneously
    select features which determine every cluster.

    Based on the article `"Frequent term-based text clustering" <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.12.7997&rep=rep1&type=pdf>`_.

    **FTS** is a set of terms that appear in some part of all samples in dataset. We will say that FTS
    *covers* sample if every term of FTS is contained in the sample.

    Algorithm does clustering in the following way:
        1. Find all FTS for dataset with specified *minsup*. Elements of FTS are terms, i.e. features of dataset.
        2. Find FTS that has the lowest Entropy Overlap with the rest clusters with respect to dataset.
           It's shown in the paper that such FTS will explain data the best.
        3. Add this FTS to clustering, remove from dataset samples covered by it, repeat steps 2 and 3.
        4. Assign clusters to samples. Sample belongs to a cluster defined by FTS if FTS covers sample.
        5. Scores of features are 1 if feature belongs to any FTS and 0 otherwise.

    Parameters
    ----------
    minsup: float
        Part of the dataset which should be covered by each FTS.
    """

    def __init__(self, minsup):
        super().__init__(-1)
        self.minsup = minsup

[docs]    def fit(self, x, *rest):
        """
        Fit algorithm to dataset, find clusters and select relevant features.

        Parameters
        ----------
        x: csr_matrix
            SciPy Sparse Matrix representing terms contained in every sample. May be created by vectorizers from sklearn.

        Returns
        -------
        self: FTC
            Returns itself to support chaining.
        """
        return super().fit(x, *rest)

    def _calc_scores_and_labels(self, x):
        n_samples, n_features = x.get_shape()
        # Construct dataset for frequent itemsets extractions
        rows, columns = x.nonzero()
        dataset = [[] for _ in range(n_samples)]
        for (i, j) in zip(rows, columns):
            if x[i, j] != 0:
                # noinspection PyTypeChecker
                dataset[i].append(int(j))
        remaining = apriori(dataset, self.minsup)
        if len(FTC._calculate_coverage(remaining, dataset)) != n_samples:
            # Dataset can't be fully clustered using this method
            raise ValueError(
                'Can\'t cluster a dataset using FTC method - it isn\'t covered by the chosen itemset '
                + str(remaining)
            )
        dataset_copy = [sample for sample in dataset]
        result = []

        # Until all samples are covered
        while len(dataset) != 0:
            min_overlap = None
            best_cluster = None
            # Find feature set with the smallest overlap with rest
            for itemset in remaining:
                overlap = FTC._calculate_overlap(itemset, remaining, dataset)
                if min_overlap is None or overlap < min_overlap:
                    min_overlap = overlap
                    best_cluster = itemset
            # Add it to result
            result.append(best_cluster)
            remaining.remove(best_cluster)

            # Remove all covered samples from dataset
            for sample in FTC._calculate_coverage([best_cluster], dataset):
                dataset.remove(sample)

        # Select features used in final clustering, assign labels according to chosen itemsets
        used_features = {f for itemset in result for f in itemset}
        scores = [1 if i in used_features else 0 for i in range(n_features)]
        labels = [None for _ in range(n_samples)]
        for i in range(n_samples):
            features = dataset_copy[i]
            for cluster_idx in range(len(result)):
                if all(c in features for c in result[cluster_idx]):
                    labels[i] = cluster_idx
                    break
        self.k = len(used_features)

        return np.array(scores), np.array(labels)

    @staticmethod
    def _calculate_coverage(clusters, dataset):
        """
        Finds samples of dataset covered by clusters

        Parameters
        ----------
        clusters: list
            List of FTS defining clusters
        dataset: list
            List of sets defining samples

        Returns
        -------
        covered: list
            List of samples covered by clusters
        """
        # Find samples covered by specified clusters
        covered = []
        for sample in dataset:
            for cluster in clusters:
                if all(c in sample for c in cluster):
                    covered.append(sample)
                    break
        return covered

    @staticmethod
    def _calculate_overlap(cluster, rest, dataset):
        """
        Calculates Entropy Overlap of cluster with rest FTS

        Parameters
        ----------
        cluster: set
            Termset defining cluster
        rest: list
            List of other termsets
        dataset: list
            List of sets defining samples

        Returns
        -------
        overlap: float
            Value of overlap of cluster with the rest of clusters
        """
        f = [0 for _ in dataset]
        # Calculate how much clusters from `rest` cover each sample
        for i in range(len(dataset)):
            sample = dataset[i]
            for cur_cluster in rest:
                if all(c in sample for c in cur_cluster):
                    f[i] += 1
        overlap = inf
        # Compute entropy-overlap for cluster
        for i in range(len(dataset)):
            sample = dataset[i]
            if all(c in sample for c in cluster):
                if overlap == inf:
                    overlap = 0
                overlap += -1.0 / f[i] * log2(1.0 / f[i])
        return overlap