import numpy as np
from fsfc.base import KBestFeatureSelector
from sklearn.cluster import KMeans
[docs]class CHIR(KBestFeatureSelector):
"""
Chi-R feature selection algorithm for text clustering.
Uses Chi-square statistics to evaluate the importance of each feature and R-coefficient
that normalises statistics features across the corpus.
Based on the article
`"Text clustering with feature selection by using statistical data." <https://ieeexplore.ieee.org/document/4408578/>`_.
Algorithm selects features in the following way:
1. Find initial clustering of dataset.
2. Compute Chi-R scores for each feature according to the article.
3. Select top *k* features according to scores.
4. Set weights for top features equal to 1 and for others to alpha.
5. Recompute clustering. If it changes, repeat steps 2-5, otherwise return weights of features.
Parameters
----------
k: int
Number of features to select.
clusters: int
Expected number of clusters.
alpha: float (default 0.1)
Value of weight of irrelevant feature.
max_iter: int (default 1000)
Maximal number of iterations of the algorithm.
"""
def __init__(self, k, clusters, alpha=0.1, max_iter=1000):
super().__init__(k)
self.clusters = clusters
self.alpha = alpha
self.max_iter = max_iter
[docs] def fit(self, x, *rest):
"""
Fit algorithm to dataset and select relevant features.
Parameters
----------
x: csr_matrix
SciPy Sparse Matrix representing terms contained in every sample. May be created by vectorizers from
sklearn. Preferred choice is TF-IDF vectorizer.
Returns
-------
self: FTC
Returns itself to support chaining.
"""
return super().fit(x, *rest)
def _calc_scores(self, x):
n_samples, n_features = x.shape
k_means = KMeans(n_clusters=self.clusters)
clusters = k_means.fit_predict(x)
weight = np.ones([n_features])
for i in range(self.max_iter):
scores = self._compute_chir_scores(x, clusters)
sorted_args = np.argsort(scores)
# Weight for top scores should be equal to 1, for others to alpha
new_weight = np.zeros_like(weight)
new_weight[sorted_args[0:n_features - self.k]] = self.alpha
new_weight[sorted_args[-self.k:]] = 1
# Recompute clusters
new_clusters = k_means.fit_predict(x.multiply(new_weight))
if np.equal(weight, new_weight).all():
break
clusters = new_clusters
weight = new_weight
return weight
@staticmethod
def _compute_chir_scores(x, labels):
# Compute matrix W where w[i, j] = amount of occurrences of j-th word in i-th cluster
n_samples, n_features = x.get_shape()
n_clusters = np.max(labels)
w = np.zeros([n_clusters, n_features])
rows, cols = x.nonzero()
for (sample, word) in zip(rows, cols):
cluster = labels[sample] - 1
w[cluster, word] += 1
row_sum = np.sum(w, 1) # Number of words in i-th cluster
column_sum = np.sum(w, 0) # Number of clusters that contain j-th word
w_sum = np.sum(row_sum) # Total number of words
# Compute Chi-R statistics using this matrix
scores = np.zeros([n_features])
for j in range(n_features):
chi = np.zeros([n_clusters]) # Chi-square statistics
r = np.zeros([n_clusters]) # R coefficient
for i in range(n_clusters):
# Contingency matrix
cont = np.array([
[w[i, j], row_sum[i] - w[i, j]],
[column_sum[j] - w[i, j], w_sum - column_sum[j] - row_sum[i] + w[i, j]]
])
# Calculation of expected frequency
def calc_e(wrd, c):
return (cont[wrd, 0] + cont[wrd, 1]) * (cont[0, c] + cont[1, c]) / w_sum
# Compute R coefficient for cluster and Chi-square statistics
r[i] = w[i, j] / calc_e(0, 0)
for a in [0, 1]:
for b in [0, 1]:
expected = calc_e(a, b)
chi[i] += (cont[a, b] - expected) ** 2 / expected
# Select clusters with positive dependence with this term
dependent = r > 1
r_dependent = r[dependent]
chi_dependent = chi[dependent]
# Compute score
scores[j] = (r_dependent / np.sum(r_dependent)).dot(chi_dependent)
return scores