Added fighting-words term importance to clustering models
x-tabdeveloping committed Dec 10, 2024
1 parent 4733be8 commit e4f8fc0
Showing 2 changed files with 74 additions and 7 deletions.
57 changes: 57 additions & 0 deletions turftopic/feature_importance.py
@@ -1,3 +1,7 @@
from __future__ import annotations

from typing import Literal

import numpy as np
import scipy.sparse as spr
from sklearn.metrics.pairwise import cosine_similarity
@@ -126,3 +130,56 @@ def bayes_rule(
    p_tw = (p_wt.T * p_t).T / p_w
    p_tw /= np.nansum(p_tw, axis=0)
    return p_tw


def fighting_words(
    doc_topic_matrix: np.ndarray,
    doc_term_matrix: spr.csr_matrix,
    prior: float | Literal["corpus"] = "corpus",
) -> np.ndarray:
    """Computes feature importance using the *Fighting Words* algorithm.

    Parameters
    ----------
    doc_topic_matrix: np.ndarray
        Document-topic matrix of shape (n_documents, n_topics)
    doc_term_matrix: spr.csr_matrix
        Document-term matrix of shape (n_documents, vocab_size)
    prior: float or "corpus", default "corpus"
        Dirichlet prior to use. When a float, it indicates the alpha
        parameter of a symmetric Dirichlet; when "corpus",
        word frequencies from the background corpus are used.

    Returns
    -------
    ndarray of shape (n_topics, vocab_size)
        Term importance matrix.
    """
    # Assign each document to its highest-probability topic.
    labels = np.argmax(doc_topic_matrix, axis=1)
    n_topics = doc_topic_matrix.shape[1]
    n_vocab = doc_term_matrix.shape[1]
    components = []
    if prior == "corpus":
        # Informative prior: background word frequencies over the whole corpus.
        priors = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
    else:
        # Symmetric Dirichlet prior with the given alpha.
        priors = np.full(n_vocab, prior)
    a0 = np.sum(priors)  # equals prior * n_vocab for a symmetric prior
    for i_topic in range(n_topics):
        topic_freq = np.ravel(
            np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
        )
        rest_freq = np.ravel(
            np.asarray(doc_term_matrix[labels != i_topic].sum(axis=0))
        )
        n1 = np.sum(topic_freq)
        n2 = np.sum(rest_freq)
        # Prior-smoothed log-odds of each word within the topic and within
        # the rest of the corpus.
        topic_logodds = np.log(
            (topic_freq + priors) / (n1 + a0 - topic_freq - priors)
        )
        rest_logodds = np.log(
            (rest_freq + priors) / (n2 + a0 - rest_freq - priors)
        )
        delta = topic_logodds - rest_logodds
        # Standardize the log-odds difference by its approximate variance.
        delta_var = 1 / (topic_freq + priors) + 1 / (rest_freq + priors)
        zscore = delta / np.sqrt(delta_var)
        components.append(zscore)
    return np.stack(components)
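For reference, the z-score that fighting_words computes is the weighted log-odds with an informative Dirichlet prior from Monroe et al. (2008), "Fightin' Words". Writing y_kw for the count of word w in topic k's documents, y_(k̄)w for its count in the remaining documents, alpha_w for the prior, n_k and n_k̄ for the total counts, and alpha_0 for the summed prior, the function above evaluates

\[
\hat{\delta}_{kw} = \log \frac{y_{kw} + \alpha_w}{n_k + \alpha_0 - y_{kw} - \alpha_w}
- \log \frac{y_{\bar{k}w} + \alpha_w}{n_{\bar{k}} + \alpha_0 - y_{\bar{k}w} - \alpha_w},
\qquad
z_{kw} = \frac{\hat{\delta}_{kw}}{\sqrt{\frac{1}{y_{kw} + \alpha_w} + \frac{1}{y_{\bar{k}w} + \alpha_w}}}
\]

A minimal sketch of calling the new function directly, with toy matrices (the import path matches the file above; the data is purely illustrative):

    import numpy as np
    import scipy.sparse as spr

    from turftopic.feature_importance import fighting_words

    # Three documents, two topics, three vocabulary terms.
    doc_topic = np.array([[0.9, 0.1], [0.8, 0.2], [0.1, 0.9]])
    doc_term = spr.csr_matrix([[3, 0, 1], [2, 1, 0], [0, 4, 1]])

    scores = fighting_words(doc_topic, doc_term)
    print(scores.shape)  # (n_topics, vocab_size) -> (2, 3)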
24 changes: 17 additions & 7 deletions turftopic/models/cluster.py
@@ -20,7 +20,7 @@
from turftopic.dynamic import DynamicTopicModel
from turftopic.feature_importance import (bayes_rule,
                                          cluster_centroid_distance, ctf_idf,
                                          soft_ctf_idf)
                                          fighting_words, soft_ctf_idf)
from turftopic.vectorizer import default_vectorizer

integer_message = """
@@ -39,7 +39,7 @@
"""

feature_message = """
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid'
feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'
"""

NOT_MATCHING_ERROR = (
@@ -152,14 +152,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
        Clustering method to use for finding topics.
        Defaults to OPTICS with 25 minimum cluster size.
        To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'fighting-words', 'centroid'}, default 'soft-c-tf-idf'
        Method for estimating term importances.
        'centroid' uses distances from cluster centroid similarly
        to Top2Vec.
        'c-tf-idf' uses BERTopic's c-tf-idf.
        'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
        be very similar to 'c-tf-idf'.
        'bayes' uses Bayes' rule.
        'fighting-words' uses the Fighting Words algorithm (a Bayesian probabilistic model).
    n_reduce_to: int, default None
        Number of topics to reduce topics to.
        The specified reduction method will be used to merge them.
@@ -188,6 +188,7 @@ def __init__(
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
] = "soft-c-tf-idf",
n_reduce_to: Optional[int] = None,
reduction_method: Literal[
@@ -202,6 +203,7 @@
"soft-c-tf-idf",
"centroid",
"bayes",
"fighting-words",
]:
raise ValueError(feature_message)
if isinstance(encoder, int):
@@ -364,21 +366,21 @@ def reset_topics(self):
    def estimate_components(
        self,
        feature_importance: Literal[
            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf", "fighting-words"
        ],
    ) -> np.ndarray:
        """Estimates feature importances based on a fitted clustering.

        Parameters
        ----------
        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'}, default 'soft-c-tf-idf'
            Method for estimating term importances.
            'centroid' uses distances from cluster centroid similarly
            to Top2Vec.
            'c-tf-idf' uses BERTopic's c-tf-idf.
            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM; the results should
            be very similar to 'c-tf-idf'.
            'bayes' uses Bayes' rule.
            'fighting-words' uses the Fighting Words algorithm (a Bayesian probabilistic model).

        Returns
        -------
@@ -426,6 +428,10 @@ def estimate_components(
            self.components_ = bayes_rule(
                document_topic_matrix, self.doc_term_matrix
            )
        elif feature_importance == "fighting-words":
            self.components_ = fighting_words(
                document_topic_matrix, self.doc_term_matrix
            )
        else:
            self.components_ = ctf_idf(
                document_topic_matrix, self.doc_term_matrix
@@ -556,6 +562,10 @@ def estimate_temporal_components(
                self.temporal_components_[i_timebin] = bayes_rule(
                    t_doc_topic, t_dtm
                )
            elif feature_importance == "fighting-words":
                self.temporal_components_[i_timebin] = fighting_words(
                    t_doc_topic, t_dtm
                )
            elif feature_importance == "centroid":
                t_topic_vectors = self._calculate_topic_vectors(
                    time_labels == i_timebin,
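End to end, the new option plugs into the normal clustering workflow. A minimal sketch, assuming ClusteringTopicModel is importable from the package root as elsewhere in turftopic, and with a stand-in corpus (a real run needs enough documents for the clustering to form topics):

    from turftopic import ClusteringTopicModel

    corpus = [
        "The goalkeeper saved a penalty in the final minute.",
        "Transformer encoders embed documents as dense vectors.",
        # ... more documents
    ]

    # Use the new Fighting Words term importance when fitting.
    model = ClusteringTopicModel(feature_importance="fighting-words")
    model.fit(corpus)

    # Or re-estimate term importances on an already fitted model,
    # without re-running the clustering:
    model.estimate_components("fighting-words")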
