Added fighting-words term importance to clustering models
x-tabdeveloping committed Dec 10, 2024
1 parent 4733be8 commit e4f8fc0
Showing 2 changed files with 74 additions and 7 deletions.
57 changes: 57 additions & 0 deletions turftopic/feature_importance.py
@@ -1,3 +1,7 @@
from __future__ import annotations

from typing import Literal

import numpy as np
import scipy.sparse as spr
from sklearn.metrics.pairwise import cosine_similarity
@@ -126,3 +130,56 @@ def bayes_rule(
    p_tw = (p_wt.T * p_t).T / p_w
    p_tw /= np.nansum(p_tw, axis=0)
    return p_tw


def fighting_words(
    doc_topic_matrix: np.ndarray,
    doc_term_matrix: spr.csr_matrix,
    prior: float | Literal["corpus"] = "corpus",
) -> np.ndarray:
    """Computes feature importance using the *Fighting Words* algorithm.

    Parameters
    ----------
    doc_topic_matrix: np.ndarray
        Document-topic matrix of shape (n_documents, n_topics)
    doc_term_matrix: spr.csr_matrix
        Document-term matrix of shape (n_documents, vocab_size)
    prior: float or "corpus", default "corpus"
        Dirichlet prior to use. When a float, it indicates the alpha
        parameter of a symmetric Dirichlet; when "corpus",
        word frequencies from the background corpus are used.

    Returns
    -------
    ndarray of shape (n_topics, vocab_size)
        Term importance matrix.
    """
    # Assign each document to its highest-probability topic.
    labels = np.argmax(doc_topic_matrix, axis=1)
    n_topics = doc_topic_matrix.shape[1]
    n_vocab = doc_term_matrix.shape[1]
    components = []
    if prior == "corpus":
        # Informative prior: background word frequencies over the whole corpus.
        priors = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
    else:
        # Symmetric Dirichlet prior with the given alpha.
        priors = np.full(n_vocab, prior)
    a0 = np.sum(priors)  # equals prior * n_vocab for a symmetric prior
    for i_topic in range(n_topics):
        topic_freq = np.ravel(
            np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
        )
        rest_freq = np.ravel(
            np.asarray(doc_term_matrix[labels != i_topic].sum(axis=0))
        )
        n1 = np.sum(topic_freq)
        n2 = np.sum(rest_freq)
        # Prior-smoothed log-odds of each word within the topic and within
        # the rest of the corpus.
        topic_logodds = np.log(
            (topic_freq + priors) / (n1 + a0 - topic_freq - priors)
        )
        rest_logodds = np.log(
            (rest_freq + priors) / (n2 + a0 - rest_freq - priors)
        )
        delta = topic_logodds - rest_logodds
        # Standardize the log-odds difference by its approximate variance.
        delta_var = 1 / (topic_freq + priors) + 1 / (rest_freq + priors)
        zscore = delta / np.sqrt(delta_var)
        components.append(zscore)
    return np.stack(components)
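For reference, the z-score that fighting_words computes is the weighted log-odds with an informative Dirichlet prior from Monroe et al. (2008), "Fightin' Words". Writing y_kw for the count of word w in topic k's documents, y_(k̄)w for its count in the remaining documents, alpha_w for the prior, n_k and n_k̄ for the total counts, and alpha_0 for the summed prior, the function above evaluates

\[
\hat{\delta}_{kw} = \log \frac{y_{kw} + \alpha_w}{n_k + \alpha_0 - y_{kw} - \alpha_w}
- \log \frac{y_{\bar{k}w} + \alpha_w}{n_{\bar{k}} + \alpha_0 - y_{\bar{k}w} - \alpha_w},
\qquad
z_{kw} = \frac{\hat{\delta}_{kw}}{\sqrt{\frac{1}{y_{kw} + \alpha_w} + \frac{1}{y_{\bar{k}w} + \alpha_w}}}
\]

A minimal sketch of calling the new function directly, with toy matrices (the import path matches the file above; the data is purely illustrative):

    import numpy as np
    import scipy.sparse as spr

    from turftopic.feature_importance import fighting_words

    # Three documents, two topics, three vocabulary terms.
    doc_topic = np.array([[0.9, 0.1], [0.8, 0.2], [0.1, 0.9]])
    doc_term = spr.csr_matrix([[3, 0, 1], [2, 1, 0], [0, 4, 1]])

    scores = fighting_words(doc_topic, doc_term)
    print(scores.shape)  # (n_topics, vocab_size) -> (2, 3)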
24 changes: 17 additions & 7 deletions turftopic/models/cluster.py
@@ -20,7 +20,7 @@
from turftopic.dynamic import DynamicTopicModel
from turftopic.feature_importance import (bayes_rule,
                                          cluster_centroid_distance, ctf_idf,
                                          soft_ctf_idf)
                                          fighting_words, soft_ctf_idf)
from turftopic.vectorizer import default_vectorizer

integer_message = """
@@ -39,7 +39,7 @@
"""

feature_message = """
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid'
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'
"""

NOT_MATCHING_ERROR = (
@@ -152,14 +152,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
        Clustering method to use for finding topics.
        Defaults to OPTICS with 25 minimum cluster size.
        To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'fighting-words', 'centroid'}, default 'soft-c-tf-idf'
        Method for estimating term importances.
        'centroid' uses distances from cluster centroid similarly
        to Top2Vec.
        'c-tf-idf' uses BERTopic's c-tf-idf.
        'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
        be very similar to 'c-tf-idf'.
        'bayes' uses Bayes' rule.
        'fighting-words' uses the Fighting Words algorithm (a Bayesian probabilistic model).
    n_reduce_to: int, default None
        Number of topics to reduce topics to.
        The specified reduction method will be used to merge them.
@@ -188,6 +188,7 @@ def __init__(
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
] = "soft-c-tf-idf",
n_reduce_to: Optional[int] = None,
reduction_method: Literal[
@@ -202,6 +203,7 @@
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
]:
raise ValueError(feature_message)
if isinstance(encoder, int):
@@ -364,21 +366,21 @@ def reset_topics(self):
    def estimate_components(
        self,
        feature_importance: Literal[
            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf", "fighting-words"
        ],
    ) -> np.ndarray:
        """Estimates feature importances based on a fitted clustering.

        Parameters
        ----------
        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'}, default 'soft-c-tf-idf'
            Method for estimating term importances.
            'centroid' uses distances from cluster centroid similarly
            to Top2Vec.
            'c-tf-idf' uses BERTopic's c-tf-idf.
            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
            be very similar to 'c-tf-idf'.
            'bayes' uses Bayes' rule.
            'fighting-words' uses the Fighting Words algorithm (a Bayesian probabilistic model).

        Returns
        -------
@@ -426,6 +428,10 @@ def estimate_components(
            self.components_ = bayes_rule(
                document_topic_matrix, self.doc_term_matrix
            )
        elif feature_importance == "fighting-words":
            self.components_ = fighting_words(
                document_topic_matrix, self.doc_term_matrix
            )
        else:
            self.components_ = ctf_idf(
                document_topic_matrix, self.doc_term_matrix
@@ -556,6 +562,10 @@ def estimate_temporal_components(
                self.temporal_components_[i_timebin] = bayes_rule(
                    t_doc_topic, t_dtm
                )
            elif feature_importance == "fighting-words":
                self.temporal_components_[i_timebin] = fighting_words(
                    t_doc_topic, t_dtm
                )
            elif feature_importance == "centroid":
                t_topic_vectors = self._calculate_topic_vectors(
                    time_labels == i_timebin,
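End to end, the new option plugs into the normal clustering workflow. A minimal sketch, assuming ClusteringTopicModel is importable from the package root as elsewhere in turftopic, and with a stand-in corpus (a real run needs enough documents for the clustering to form topics):

    from turftopic import ClusteringTopicModel

    corpus = [
        "The goalkeeper saved a penalty in the final minute.",
        "Transformer encoders embed documents as dense vectors.",
        # ... more documents
    ]

    # Use the new Fighting Words term importance when fitting.
    model = ClusteringTopicModel(feature_importance="fighting-words")
    model.fit(corpus)

    # Or re-estimate term importances on an already fitted model,
    # without re-running the clustering:
    model.estimate_components("fighting-words")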
