diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py
index ea1fb32..1b2442a 100644
--- a/turftopic/feature_importance.py
+++ b/turftopic/feature_importance.py
@@ -1,3 +1,7 @@
+from __future__ import annotations
+
+from typing import Literal
+
 import numpy as np
 import scipy.sparse as spr
 from sklearn.metrics.pairwise import cosine_similarity
@@ -126,3 +130,58 @@ def bayes_rule(
     p_tw = (p_wt.T * p_t).T / p_w
     p_tw /= np.nansum(p_tw, axis=0)
     return p_tw
+
+
+def fighting_words(
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    prior: float | Literal["corpus"] = "corpus",
+) -> np.ndarray:
+    """Computes feature importance using the *Fighting Words* algorithm.
+
+    Parameters
+    ----------
+    doc_topic_matrix: np.ndarray
+        Document-topic matrix of shape (n_documents, n_topics)
+    doc_term_matrix: spr.csr_matrix
+        Document-term matrix of shape (n_documents, vocab_size)
+    prior: float or "corpus", default "corpus"
+        Dirichlet prior to use. When a float, it indicates the alpha
+        parameter of a symmetric Dirichlet; if "corpus",
+        word frequencies from the background corpus are used.
+
+    Returns
+    -------
+    ndarray of shape (n_topics, vocab_size)
+        Term importance matrix.
+    """
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    n_topics = doc_topic_matrix.shape[1]
+    n_vocab = doc_term_matrix.shape[1]
+    components = []
+    if prior == "corpus":
+        priors = np.ravel(np.asarray(doc_term_matrix.sum(axis=0)))
+    else:
+        priors = np.full(n_vocab, prior)
+    a0 = np.sum(priors)  # total prior mass; equals prior * n_vocab for a symmetric prior
+    for i_topic in range(n_topics):
+        topic_freq = np.ravel(
+            np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0))
+        )
+        rest_freq = np.ravel(
+            np.asarray(doc_term_matrix[labels != i_topic].sum(axis=0))
+        )
+        n1 = np.sum(topic_freq)
+        n2 = np.sum(rest_freq)
+        # Prior-smoothed log-odds of each term in the topic vs. the rest of the corpus
+        topic_logodds = np.log(
+            (topic_freq + priors) / (n1 + a0 - topic_freq - priors)
+        )
+        rest_logodds = np.log(
+            (rest_freq + priors) / (n2 + a0 - rest_freq - priors)
+        )
+        delta = topic_logodds - rest_logodds
+        delta_var = 1 / (topic_freq + priors) + 1 / (rest_freq + priors)
+        zscore = delta / np.sqrt(delta_var)  # z-score of the log-odds difference
+        components.append(zscore)
+    return np.stack(components)
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index a6eea27..8f442f4 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -20,7 +20,7 @@
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.feature_importance import (bayes_rule,
                                           cluster_centroid_distance, ctf_idf,
-                                          soft_ctf_idf)
+                                          fighting_words, soft_ctf_idf)
 from turftopic.vectorizer import default_vectorizer

 integer_message = """
@@ -39,7 +39,7 @@
 """

 feature_message = """
-feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid'
+feature_importance must be one of 'soft-c-tf-idf', 'c-tf-idf', 'centroid', 'bayes', 'fighting-words'
 """

 NOT_MATCHING_ERROR = (
@@ -152,14 +152,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
         Clustering method to use for finding topics.
         Defaults to OPTICS with 25 minimum cluster size.
         To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
+    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'fighting-words', 'centroid'}, default 'soft-c-tf-idf'
         Method for estimating term importances.
         'centroid' uses distances from cluster centroid similarly
         to Top2Vec.
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'fighting-words' uses the Fighting Words algorithm (a Bayesian log-odds model).
     n_reduce_to: int, default None
         Number of topics to reduce topics to.
         The specified reduction method will be used to merge them.
@@ -188,6 +188,7 @@ def __init__(
             "soft-c-tf-idf",
             "centroid",
             "bayes",
+            "fighting-words",
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
         reduction_method: Literal[
@@ -202,6 +203,7 @@
             "soft-c-tf-idf",
             "centroid",
             "bayes",
+            "fighting-words",
         ]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
@@ -364,21 +366,21 @@ def reset_topics(self):
     def estimate_components(
         self,
         feature_importance: Literal[
-            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
+            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf", "fighting-words"
         ],
     ) -> np.ndarray:
         """Estimates feature importances based on a fitted clustering.

         Parameters
         ----------
-        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
+        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid', 'fighting-words'}, default 'soft-c-tf-idf'
             Method for estimating term importances.
             'centroid' uses distances from cluster centroid similarly
             to Top2Vec.
             'c-tf-idf' uses BERTopic's c-tf-idf.
             'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
             be very similar to 'c-tf-idf'.
-            'bayes' uses Bayes' rule.
+            'fighting-words' uses the Fighting Words algorithm (a Bayesian log-odds model).

         Returns
         -------
@@ -426,6 +428,10 @@ def estimate_components(
             self.components_ = bayes_rule(
                 document_topic_matrix, self.doc_term_matrix
             )
+        elif feature_importance == "fighting-words":
+            self.components_ = fighting_words(
+                document_topic_matrix, self.doc_term_matrix
+            )
         else:
             self.components_ = ctf_idf(
                 document_topic_matrix, self.doc_term_matrix
@@ -556,6 +562,10 @@ def estimate_temporal_components(
             self.temporal_components_[i_timebin] = bayes_rule(
                 t_doc_topic, t_dtm
             )
+            elif feature_importance == "fighting-words":
+                self.temporal_components_[i_timebin] = fighting_words(
+                    t_doc_topic, t_dtm
+                )
             elif feature_importance == "centroid":
                 t_topic_vectors = self._calculate_topic_vectors(
                     time_labels == i_timebin,
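A minimal usage sketch, not part of the diff itself: it assumes the ClusteringTopicModel
fit/estimate_components API shown in the hunks above (fit on a list of raw documents), and the
corpus below is a placeholder.

    from turftopic.models.cluster import ClusteringTopicModel

    corpus = ["replace these strings", "with your own documents"]  # placeholder data

    # Select the new term-importance method at construction time
    model = ClusteringTopicModel(feature_importance="fighting-words")
    model.fit(corpus)

    # Or re-estimate term importances on an already fitted model
    model.estimate_components(feature_importance="fighting-words")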