Source code for fsfc.text.CHIR

import numpy as np
from fsfc.base import KBestFeatureSelector
from sklearn.cluster import KMeans


[docs]class CHIR(KBestFeatureSelector):
    """
    Chi-R feature selection algorithm for text clustering.

    Uses Chi-square statistics to evaluate the importance of each feature and R-coefficient
    that normalises statistics features across the corpus.

    Based on the article
    `"Text clustering with feature selection by using statistical data." <https://ieeexplore.ieee.org/document/4408578/>`_.

    Algorithm selects features in the following way:
        1. Find initial clustering of dataset.
        2. Compute Chi-R scores for each feature according to the article.
        3. Select top *k* features according to scores.
        4. Set weights for top features equal to 1 and for others to alpha.
        5. Recompute clustering. If it changes, repeat steps 2-5, otherwise return weights of features.

    Parameters
    ----------
    k: int
        Number of features to select.
    clusters: int
        Expected number of clusters.
    alpha: float (default 0.1)
        Value of weight of irrelevant feature.
    max_iter: int (default 1000)
        Maximal number of iterations of the algorithm.

    """
    def __init__(self, k, clusters, alpha=0.1, max_iter=1000):
        super().__init__(k)
        self.clusters = clusters
        self.alpha = alpha
        self.max_iter = max_iter

[docs]    def fit(self, x, *rest):
        """
        Fit algorithm to dataset and select relevant features.

        Parameters
        ----------
        x: csr_matrix
            SciPy Sparse Matrix representing terms contained in every sample. May be created by vectorizers from
            sklearn. Preferred choice is TF-IDF vectorizer.

        Returns
        -------
        self: FTC
            Returns itself to support chaining.
        """
        return super().fit(x, *rest)

    def _calc_scores(self, x):
        n_samples, n_features = x.shape

        k_means = KMeans(n_clusters=self.clusters)
        clusters = k_means.fit_predict(x)
        weight = np.ones([n_features])
        for i in range(self.max_iter):
            scores = self._compute_chir_scores(x, clusters)
            sorted_args = np.argsort(scores)

            # Weight for top scores should be equal to 1, for others to alpha
            new_weight = np.zeros_like(weight)
            new_weight[sorted_args[0:n_features - self.k]] = self.alpha
            new_weight[sorted_args[-self.k:]] = 1

            # Recompute clusters
            new_clusters = k_means.fit_predict(x.multiply(new_weight))
            if np.equal(weight, new_weight).all():
                break
            clusters = new_clusters
            weight = new_weight
        return weight

    @staticmethod
    def _compute_chir_scores(x, labels):
        # Compute matrix W where w[i, j] = amount of occurrences of j-th word in i-th cluster
        n_samples, n_features = x.get_shape()
        n_clusters = np.max(labels)
        w = np.zeros([n_clusters, n_features])
        rows, cols = x.nonzero()
        for (sample, word) in zip(rows, cols):
            cluster = labels[sample] - 1
            w[cluster, word] += 1
        row_sum = np.sum(w, 1)  # Number of words in i-th cluster
        column_sum = np.sum(w, 0)  # Number of clusters that contain j-th word
        w_sum = np.sum(row_sum)  # Total number of words

        # Compute Chi-R statistics using this matrix
        scores = np.zeros([n_features])
        for j in range(n_features):
            chi = np.zeros([n_clusters])  # Chi-square statistics
            r = np.zeros([n_clusters])  # R coefficient
            for i in range(n_clusters):
                # Contingency matrix
                cont = np.array([
                    [w[i, j], row_sum[i] - w[i, j]],
                    [column_sum[j] - w[i, j], w_sum - column_sum[j] - row_sum[i] + w[i, j]]
                ])

                # Calculation of expected frequency
                def calc_e(wrd, c):
                    return (cont[wrd, 0] + cont[wrd, 1]) * (cont[0, c] + cont[1, c]) / w_sum

                # Compute R coefficient for cluster and Chi-square statistics
                r[i] = w[i, j] / calc_e(0, 0)
                for a in [0, 1]:
                    for b in [0, 1]:
                        expected = calc_e(a, b)
                        chi[i] += (cont[a, b] - expected) ** 2 / expected
            # Select clusters with positive dependence with this term
            dependent = r > 1
            r_dependent = r[dependent]
            chi_dependent = chi[dependent]
            # Compute score
            scores[j] = (r_dependent / np.sum(r_dependent)).dot(chi_dependent)
        return scores