From ffb51a29ae40e5082eba381a1a41d1d0a47e6c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:06:59 +0200
Subject: [PATCH 01/13] Added Bayes' rule feature importance to clustering
 models

---
 turftopic/feature_importance.py | 35 +++++++++++++++++++++++++++++++++
 turftopic/models/cluster.py     | 19 ++++++++++++++++--
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py
index 0657dd7..f329b79 100644
--- a/turftopic/feature_importance.py
+++ b/turftopic/feature_importance.py
@@ -1,6 +1,7 @@
 import numpy as np
 import scipy.sparse as spr
 from sklearn.metrics import pairwise_distances
+from sklearn.preprocessing import normalize
 
 
 def cluster_centroid_distance(
@@ -94,3 +95,37 @@ def ctf_idf(
         component = freq * np.log(1 + average / overall_freq)
         components.append(component)
     return np.stack(components)
+
+
+def bayes_rule(
+    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+) -> np.ndarray:
+    """Computes feature importance based on Bayes' rule.
+    The importance of a word for a topic is the probability of the topic conditional on the word.
+
+    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
+
+    Parameters
+    ----------
+    doc_topic_matrix: np.ndarray
+        Document-topic matrix of shape (n_documents, n_topics)
+    doc_term_matrix: spr.csr_matrix
+        Document-term matrix of shape (n_documents, vocab_size)
+
+    Returns
+    -------
+    ndarray of shape (n_topics, vocab_size)
+        Term importance matrix.
+    """
+    eps = np.finfo(float).eps
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = p_w / p_w.sum()
+    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = p_t / p_t.sum()
+    term_importance = doc_topic_matrix.T @ doc_term_matrix
+    overall_in_topic = np.abs(term_importance).sum(axis=1)
+    p_wt = (term_importance.T / (overall_in_topic + eps)).T
+    p_wt = normalize(p_wt, norm="l1", axis=1)
+    p_tw = (p_wt.T * p_t).T / p_w
+    p_tw = normalize(p_tw, axis=0, norm="l1")
+    return p_tw
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 0318546..a31f802 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -14,6 +14,7 @@
 from turftopic.base import ContextualModel, Encoder
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.feature_importance import (
+    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     soft_ctf_idf,
@@ -156,7 +157,10 @@ def __init__(
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
         feature_importance: Literal[
-            "c-tf-idf", "soft-c-tf-idf", "centroid"
+            "c-tf-idf",
+            "soft-c-tf-idf",
+            "centroid",
+            "bayes",
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
         reduction_method: Literal[
@@ -166,7 +170,12 @@
     ):
         self.encoder = encoder
         self.random_state = random_state
-        if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
+        if feature_importance not in [
+            "c-tf-idf",
+            "soft-c-tf-idf",
+            "centroid",
+            "bayes",
+        ]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
@@ -256,6 +265,10 @@ def _estimate_parameters(
                 self.vocab_embeddings,
                 metric="cosine",
             )
+        elif self.feature_importance == "bayes":
+            self.components_ = bayes_rule(
+                document_topic_matrix, doc_term_matrix
+            )
         else:
             self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix)
 
@@ -368,6 +381,8 @@ def fit_transform_dynamic(
                     )
                 elif self.feature_importance == "c-tf-idf":
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
+            elif 
self.feature_importance == "bayes": + components = bayes_rule(t_doc_topic_matrix, t_doc_term_matrix) elif self.feature_importance == "centroid": time_index = time_labels == i_timebin t_topic_vectors = calculate_topic_vectors( From e6b721a76c833ae5d6a4b856e8bf30eb4e060703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:25:13 +0200 Subject: [PATCH 02/13] Made clustering models' feature importance estimation more efficient --- turftopic/models/cluster.py | 44 +++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index a31f802..26bcc14 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -12,6 +12,7 @@ from sklearn.preprocessing import label_binarize from turftopic.base import ContextualModel, Encoder +from turftopic.data import TopicData from turftopic.dynamic import DynamicTopicModel from turftopic.feature_importance import ( bayes_rule, @@ -238,28 +239,32 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels - def _estimate_parameters( + def estimate_components( self, - embeddings: np.ndarray, - doc_term_matrix: np.ndarray, - ): + feature_importance: Literal[ + "centroid", "soft_ctf_idf", "bayes", "c-tf-idf" + ], + ) -> np.array: clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) - self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings) - self.vocab_embeddings = self.encoder_.encode( - self.vectorizer.get_feature_names_out() - ) # type: ignore + self.topic_vectors_ = calculate_topic_vectors( + self.labels_, self.embeddings + ) document_topic_matrix = label_binarize( self.labels_, classes=self.classes_ ) if self.feature_importance == "soft-c-tf-idf": self.components_ = soft_ctf_idf( - document_topic_matrix, doc_term_matrix + document_topic_matrix, self.doc_term_matrix ) # type: ignore elif self.feature_importance == "centroid": + if not hasattr(self, "vocab_embeddings"): + self.vocab_embeddings = self.encoder_.encode( + self.vectorizer.get_feature_names_out() + ) # type: ignore self.components_ = cluster_centroid_distance( self.topic_vectors_, self.vocab_embeddings, @@ -267,10 +272,13 @@ def _estimate_parameters( ) elif self.feature_importance == "bayes": self.components_ = bayes_rule( - document_topic_matrix, doc_term_matrix + document_topic_matrix, self.doc_term_matrix ) else: - self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix) + self.components_ = ctf_idf( + document_topic_matrix, self.doc_term_matrix + ) + return self.components_ def fit_predict( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None @@ -296,6 +304,7 @@ def fit_predict( if embeddings is None: status.update("Encoding documents") embeddings = self.encoder_.encode(raw_documents) + self.embeddings = embeddings console.log("Encoding done.") status.update("Extracting terms") self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) @@ -309,10 +318,7 @@ def fit_predict( self.labels_ = self.clustering.fit_predict(reduced_embeddings) console.log("Clustering done.") status.update("Estimating parameters.") - self._estimate_parameters( - embeddings, - self.doc_term_matrix, - ) + self.estimate_components(self.feature_importance) console.log("Parameter estimation done.") if self.n_reduce_to is not None: n_topics = self.classes_.shape[0] @@ -327,12 +333,12 
@@ def fit_predict(
                     f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
                 )
                 status.update("Reestimating parameters.")
-                self._estimate_parameters(
-                    embeddings,
-                    self.doc_term_matrix,
-                )
+                self.estimate_components(self.feature_importance)
                 console.log("Reestimation done.")
             console.log("Model fitting done.")
+        self.doc_term_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
         return self.labels_
 
     def fit_transform(

From 5692b73c5f6d7b94e4a14433875935b17dd51475 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:33:19 +0200
Subject: [PATCH 03/13] Added reduce_topics method to clustering models

---
 turftopic/models/cluster.py | 46 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 26bcc14..4954695 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -239,12 +239,56 @@ def _merge_smallest(self, n_reduce_to: int):
         labels[labels == from_topic] = to_topic
         return labels
 
+    def reduce_topics(
+        self,
+        n_reduce_to: int,
+        reduction_method: Literal["smallest", "agglomerative"],
+    ) -> np.ndarray:
+        """Reduces the clustering to the desired number of topics with the given method.
+
+        Parameters
+        ----------
+        n_reduce_to: int
+            Number of topics to reduce the clustering to.
+            The specified reduction method will be used to merge
+            the clusters until this number is reached.
+        reduction_method: 'agglomerative', 'smallest'
+            Method used to reduce the number of topics post-hoc.
+            When 'agglomerative', BERTopic's topic reduction method is used,
+            where topic vectors are hierarchically clustered.
+            When 'smallest', the smallest topic gets merged into the closest
+            non-outlier cluster until the desired number
+            is achieved similarly to Top2Vec.
+
+        Returns
+        -------
+        ndarray of shape (n_documents)
+            New cluster labels for documents.
+        """
+        if reduction_method == "smallest":
+            self.labels_ = self._merge_smallest(n_reduce_to)
+        elif reduction_method == "agglomerative":
+            self.labels_ = self._merge_agglomerative(n_reduce_to)
+        return self.labels_
+
     def estimate_components(
         self,
         feature_importance: Literal[
-            "centroid", "soft_ctf_idf", "bayes", "c-tf-idf"
+            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
         ],
     ) -> np.array:
+        """Estimates feature importances based on a fitted clustering.
+
+        Parameters
+        ----------
+        feature_importance: {'centroid', 'soft-c-tf-idf', 'bayes', 'c-tf-idf'}
+            Estimation method.
+
+        Returns
+        -------
+        ndarray of shape (n_components, n_vocab)
+            Topic-term matrix.
+        """
         clusters = np.unique(self.labels_)

From 963c89f3b689baae6941213912969e2758cd6d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:36:02 +0200
Subject: [PATCH 04/13] Fixed docstrings for feature_importance

---
 turftopic/models/cluster.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 4954695..b6bb3ee 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -127,13 +127,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
         Clustering method to use for finding topics.
         Defaults to OPTICS with 25 minimum cluster size.
         To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-    feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf'
+    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
         Method for estimating term importances.
         'centroid' uses distances from cluster centroid similarly
         to Top2Vec.
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
+        'bayes' uses Bayes' rule.
     n_reduce_to: int, default None
         Number of topics to reduce topics to.
         The specified reduction method will be used to merge them.
@@ -281,8 +282,14 @@ def estimate_components(
 
         Parameters
         ----------
-        feature_importance: {'centroid', 'soft-c-tf-idf', 'bayes', 'c-tf-idf'}
-            Estimation method.
+        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}
+            Method for estimating term importances.
+            'centroid' uses distances from cluster centroid similarly
+            to Top2Vec.
+            'c-tf-idf' uses BERTopic's c-tf-idf.
+            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
+            be very similar to 'c-tf-idf'.
+            'bayes' uses Bayes' rule.
 
         Returns
         -------

From a9bb21c16a12fee18c0fdc0c5a2f4cb0fbe619f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:43:09 +0200
Subject: [PATCH 05/13] Added option for resetting topic reduction

---
 turftopic/models/cluster.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index b6bb3ee..d6fae2d 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from typing import Literal, Optional, Union
 
@@ -12,7 +13,6 @@
 from sklearn.preprocessing import label_binarize
 
 from turftopic.base import ContextualModel, Encoder
-from turftopic.data import TopicData
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.feature_importance import (
     bayes_rule,
@@ -266,12 +266,21 @@ def reduce_topics(
         ndarray of shape (n_documents)
             New cluster labels for documents.
         """
+        if not hasattr(self, "original_labels_"):
+            self.original_labels_ = self.labels_
         if reduction_method == "smallest":
             self.labels_ = self._merge_smallest(n_reduce_to)
         elif reduction_method == "agglomerative":
             self.labels_ = self._merge_agglomerative(n_reduce_to)
         return self.labels_
 
+    def reset_reduction(self):
+        if not hasattr(self, "original_labels_"):
+            warnings.warn("Topics have never been reduced, nothing to reset.")
+        else:
+            self.labels_ = self.original_labels_
+            self.estimate_components(self.feature_importance)
+
     def estimate_components(
         self,
         feature_importance: Literal[
@@ -376,10 +385,7 @@
             status.update(
                 f"Reducing topics from {n_topics} to {self.n_reduce_to}"
             )
-            if self.reduction_method == "agglomerative":
-                self.labels_ = self._merge_agglomerative(self.n_reduce_to)
-            else:
-                self.labels_ = self._merge_smallest(self.n_reduce_to)
+            self.reduce_topics(self.n_reduce_to, self.reduction_method)
             console.log(
                 f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
) From 76789f31158a62ff943f3e74817b4bbf9b071312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:43:40 +0200 Subject: [PATCH 06/13] Added docstring to reset_reduction --- turftopic/models/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index d6fae2d..814ecb5 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -275,6 +275,7 @@ def reduce_topics( return self.labels_ def reset_reduction(self): + """Resets topic reductions to the original clustering.""" if not hasattr(self, "original_labels_"): warnings.warn("Topics have never been reduced, nothing to reset.") else: From 7d98e12ee3345bb1a0ebbb9271bd310c3cda0f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:44:47 +0200 Subject: [PATCH 07/13] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 384d349..ac0d284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ line-length=79 [tool.poetry] name = "turftopic" -version = "0.5.0" +version = "0.5.1" description = "Topic modeling with contextual representations from sentence transformers." authors = ["Márton Kardos "] license = "MIT" From 10d27114e163225dcc38a681af2a221e727dcbe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:46:57 +0200 Subject: [PATCH 08/13] Fixed bug with embeddings --- turftopic/models/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 814ecb5..95629a3 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -365,8 +365,8 @@ def fit_predict( if embeddings is None: status.update("Encoding documents") embeddings = self.encoder_.encode(raw_documents) - self.embeddings = embeddings console.log("Encoding done.") + self.embeddings = embeddings status.update("Extracting terms") self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") From 660a91f520dd1b30a24db82dd7048b932a0dbb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 5 Aug 2024 10:01:14 +0200 Subject: [PATCH 09/13] Made term importance estimation more immune to missing values --- turftopic/feature_importance.py | 33 +++++---- turftopic/models/cluster.py | 115 ++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 61 deletions(-) diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py index f329b79..ea1fb32 100644 --- a/turftopic/feature_importance.py +++ b/turftopic/feature_importance.py @@ -1,13 +1,11 @@ import numpy as np import scipy.sparse as spr -from sklearn.metrics import pairwise_distances -from sklearn.preprocessing import normalize +from sklearn.metrics.pairwise import cosine_similarity def cluster_centroid_distance( cluster_centroids: np.ndarray, vocab_embeddings: np.ndarray, - metric="cosine", ) -> np.ndarray: """Computes feature importances based on distances between topic vectors (cluster centroids) and term embeddings @@ -18,25 +16,21 @@ def cluster_centroid_distance( Coordinates of cluster centroids of shape (n_topics, embedding_size) vocab_embeddings: np.ndarray Term embeddings of shape (vocab_size, embedding_size) - metric: str, defaul 'cosine' - Metric used to compute distance from centroid. - See documentation for sklearn.metrics.pairwise.distance_metrics - for valid values. 
Returns ------- ndarray of shape (n_topics, vocab_size) Term importance matrix. """ - distances = pairwise_distances( - cluster_centroids, vocab_embeddings, metric=metric + n_components = cluster_centroids.shape[0] + n_vocab = vocab_embeddings.shape[0] + components = np.full((n_components, n_vocab), np.nan) + valid_centroids = np.all(np.isfinite(cluster_centroids), axis=1) + similarities = cosine_similarity( + cluster_centroids[valid_centroids], vocab_embeddings ) - similarities = -distances / np.max(distances) - # Z-score transformation - similarities = (similarities - np.mean(similarities)) / np.std( - similarities - ) - return similarities + components[valid_centroids, :] = similarities + return components def soft_ctf_idf( @@ -88,6 +82,7 @@ def ctf_idf( components = [] overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0))) average = overall_freq.sum() / n_topics + overall_freq[overall_freq == 0] = np.finfo(float).eps for i_topic in range(n_topics): freq = np.ravel( np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0)) @@ -120,12 +115,14 @@ def bayes_rule( eps = np.finfo(float).eps p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) p_w = p_w / p_w.sum() + p_w[p_w <= 0] = eps p_t = doc_topic_matrix.sum(axis=0) p_t = p_t / p_t.sum() term_importance = doc_topic_matrix.T @ doc_term_matrix overall_in_topic = np.abs(term_importance).sum(axis=1) - p_wt = (term_importance.T / (overall_in_topic + eps)).T - p_wt = normalize(p_wt, norm="l1", axis=1) + overall_in_topic[overall_in_topic <= 0] = eps + p_wt = (term_importance.T / (overall_in_topic)).T + p_wt /= p_wt.sum(axis=1)[:, None] p_tw = (p_wt.T * p_t).T / p_w - p_tw = normalize(p_tw, axis=0, norm="l1") + p_tw /= np.nansum(p_tw, axis=0) return p_tw diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 95629a3..3cd7eb3 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -203,6 +203,22 @@ def __init__( self.n_reduce_to = n_reduce_to self.reduction_method = reduction_method + def _calculate_topic_vectors( + self, is_in_slice: Optional[np.ndarray] = None + ) -> np.ndarray: + label_to_idx = {label: idx for idx, label in enumerate(self.classes_)} + n_topics = len(self.classes_) + n_dims = self.embeddings.shape[1] + topic_vectors = np.full((n_topics, n_dims), np.nan) + for label in np.unique(self.labels_): + doc_idx = self.labels_ == label + if is_in_slice is not None: + doc_idx = doc_idx & is_in_slice + topic_vectors[label_to_idx[label], :] = np.mean( + self.embeddings[doc_idx], axis=0 + ) + return topic_vectors + def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: n_topics = self.components_.shape[0] res = {old_label: old_label for old_label in self.classes_} @@ -311,9 +327,7 @@ def estimate_components( self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) - self.topic_vectors_ = calculate_topic_vectors( - self.labels_, self.embeddings - ) + self.topic_vectors_ = self._calculate_topic_vectors() document_topic_matrix = label_binarize( self.labels_, classes=self.classes_ ) @@ -329,7 +343,6 @@ def estimate_components( self.components_ = cluster_centroid_distance( self.topic_vectors_, self.vocab_embeddings, - metric="cosine", ) elif self.feature_importance == "bayes": self.components_ = bayes_rule( @@ -394,7 +407,7 @@ def fit_predict( self.estimate_components(self.feature_importance) console.log("Reestimation done.") console.log("Model fitting done.") - self.doc_term_matrix = label_binarize( + self.doc_topic_matrix = label_binarize( 
self.labels_, classes=self.classes_ ) return self.labels_ @@ -405,6 +418,56 @@ def fit_transform( labels = self.fit_predict(raw_documents, y, embeddings) return label_binarize(labels, classes=self.classes_) + def estimate_temporal_components( + self, + time_labels, + time_bin_edges, + feature_importance: Literal[ + "c-tf-idf", "soft-c-tf-idf", "centroid", "bayes" + ], + ): + n_comp, n_vocab = self.components_.shape + n_bins = len(self.time_bin_edges) - 1 + self.temporal_components_ = np.full( + (n_bins, n_comp, n_vocab), + np.nan, + dtype=self.components_.dtype, + ) + self.temporal_importance_ = np.zeros((n_bins, n_comp)) + for i_timebin in np.unique(time_labels): + topic_importances = self.doc_topic_matrix[ + time_labels == i_timebin + ].sum(axis=0) + if not topic_importances.sum() == 0: + topic_importances = topic_importances / topic_importances.sum() + self.temporal_importance_[i_timebin, :] = topic_importances + t_dtm = self.doc_term_matrix[time_labels == i_timebin] + t_doc_topic = self.doc_topic_matrix[time_labels == i_timebin] + if feature_importance == "c-tf-idf": + self.temporal_components_[i_timebin] = ctf_idf( + t_doc_topic, t_dtm + ) + elif feature_importance == "soft-c-tf-idf": + self.temporal_components_[i_timebin] = soft_ctf_idf( + t_doc_topic, t_dtm + ) + elif feature_importance == "bayes": + self.temporal_components_[i_timebin] = bayes_rule( + t_doc_topic, t_dtm + ) + elif feature_importance == "centroid": + t_topic_vectors = self._calculate_topic_vectors( + time_labels == i_timebin, + ) + components = cluster_centroid_distance( + t_topic_vectors, + self.vocab_embeddings, + ) + mask_terms = t_dtm.sum(axis=0).astype(np.float64) + mask_terms = np.squeeze(np.asarray(mask_terms)) + components[:, mask_terms == 0] = np.nan + self.temporal_components_[i_timebin] = components + def fit_transform_dynamic( self, raw_documents, @@ -431,42 +494,8 @@ def fit_transform_dynamic( self.temporal_importance_ = np.zeros((n_bins, n_comp)) if embeddings is None: embeddings = self.encoder_.encode(raw_documents) - for i_timebin in np.unique(time_labels): - topic_importances = doc_topic_matrix[time_labels == i_timebin].sum( - axis=0 - ) - topic_importances = topic_importances / topic_importances.sum() - t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin] - t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin] - if "c-tf-idf" in self.feature_importance: - if self.feature_importance == "soft-c-tf-idf": - components = soft_ctf_idf( - t_doc_topic_matrix, t_doc_term_matrix - ) - elif self.feature_importance == "c-tf-idf": - components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix) - elif self.feature_importance == "bayes": - components = bayes_rule(t_doc_topic_matrix, t_doc_term_matrix) - elif self.feature_importance == "centroid": - time_index = time_labels == i_timebin - t_topic_vectors = calculate_topic_vectors( - self.labels_, - embeddings, - time_index, - ) - topic_mask = np.isnan(t_topic_vectors).all( - axis=1, keepdims=True - ) - t_topic_vectors[:] = 0 - components = cluster_centroid_distance( - t_topic_vectors, - self.vocab_embeddings, - metric="cosine", - ) - components *= topic_mask - mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64) - mask_terms[mask_terms == 0] = np.nan - components *= mask_terms - self.temporal_components_[i_timebin] = components - self.temporal_importance_[i_timebin] = topic_importances + self.embeddings = embeddings + self.estimate_temporal_components( + time_labels, self.time_bin_edges, self.feature_importance + ) return 
doc_topic_matrix

From 7ee21fdb3452528cb7a80f1a226ad1db00f814 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Mon, 5 Aug 2024 10:11:47 +0200
Subject: [PATCH 10/13] Fixed estimate_components

---
 turftopic/models/cluster.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 3cd7eb3..dfbe262 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -14,12 +14,9 @@
 from turftopic.base import ContextualModel, Encoder
 from turftopic.dynamic import DynamicTopicModel
-from turftopic.feature_importance import (
-    bayes_rule,
-    cluster_centroid_distance,
-    ctf_idf,
-    soft_ctf_idf,
-)
+from turftopic.feature_importance import (bayes_rule,
+                                          cluster_centroid_distance, ctf_idf,
+                                          soft_ctf_idf)
 from turftopic.vectorizer import default_vectorizer
 
 integer_message = """
@@ -331,11 +328,11 @@ def estimate_components(
         document_topic_matrix = label_binarize(
             self.labels_, classes=self.classes_
         )
-        if self.feature_importance == "soft-c-tf-idf":
+        if feature_importance == "soft-c-tf-idf":
             self.components_ = soft_ctf_idf(
                 document_topic_matrix, self.doc_term_matrix
             )  # type: ignore
-        elif self.feature_importance == "centroid":
+        elif feature_importance == "centroid":
             if not hasattr(self, "vocab_embeddings"):
                 self.vocab_embeddings = self.encoder_.encode(
                     self.vectorizer.get_feature_names_out()
@@ -344,7 +341,7 @@
                 self.topic_vectors_,
                 self.vocab_embeddings,
             )
-        elif self.feature_importance == "bayes":
+        elif feature_importance == "bayes":
             self.components_ = bayes_rule(
                 document_topic_matrix, self.doc_term_matrix
             )

From b8688e1b5334ebc52a3110f79a35ccc2bfdc848c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Mon, 19 Aug 2024 16:14:02 +0200
Subject: [PATCH 11/13] Added docstrings and meaningful error messages to
 estimate_components

---
 turftopic/models/cluster.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index dfbe262..b69181c 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -7,6 +7,7 @@
 from sentence_transformers import SentenceTransformer
 from sklearn.base import ClusterMixin, TransformerMixin
 from sklearn.cluster import OPTICS, AgglomerativeClustering
+from sklearn.exceptions import NotFittedError
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_distances
@@ -300,7 +301,7 @@ def estimate_components(
         feature_importance: Literal[
             "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
         ],
-    ) -> np.array:
+    ) -> np.ndarray:
         """Estimates feature importances based on a fitted clustering.
 
         Parameters
         ----------
@@ -319,6 +320,10 @@
         ndarray of shape (n_components, n_vocab)
             Topic-term matrix.
         """
+        if getattr(self, "labels_", None) is None:
+            raise NotFittedError(
+                "The model has not been fitted yet, please fit the model before estimating components."
+            )
         clusters = np.unique(self.labels_)
         self.classes_ = np.sort(clusters)
         self.topic_sizes_ = np.array(
@@ -422,8 +427,31 @@ def estimate_temporal_components(
         feature_importance: Literal[
             "c-tf-idf", "soft-c-tf-idf", "centroid", "bayes"
         ],
-    ):
+    ) -> np.ndarray:
+        """Estimates temporal components based on a fitted topic model.
+ + Parameters + ---------- + feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf' + Method for estimating term importances. + 'centroid' uses distances from cluster centroid similarly + to Top2Vec. + 'c-tf-idf' uses BERTopic's c-tf-idf. + 'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should + be very similar to 'c-tf-idf'. + 'bayes' uses Bayes' rule. + + Returns + ------- + ndarray of shape (n_time_bins, n_components, n_vocab) + Temporal topic-term matrix. + """ + if getattr(self, "components_", None) is None: + raise NotFittedError( + "The model has not been fitted yet, please fit the model before estimating temporal components." + ) n_comp, n_vocab = self.components_.shape + self.time_bin_edges = time_bin_edges n_bins = len(self.time_bin_edges) - 1 self.temporal_components_ = np.full( (n_bins, n_comp, n_vocab), @@ -464,6 +492,7 @@ def estimate_temporal_components( mask_terms = np.squeeze(np.asarray(mask_terms)) components[:, mask_terms == 0] = np.nan self.temporal_components_[i_timebin] = components + return self.temporal_components_ def fit_transform_dynamic( self, From c6ac90b5f5745d305dbebaa3fbc33e8d00e6b589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 27 Aug 2024 10:31:39 +0200 Subject: [PATCH 12/13] Started implementing hierarchical topic joining in clustering models --- turftopic/hierarchical.py | 48 +++++++++++++++++++++++++++ turftopic/models/cluster.py | 65 +++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/turftopic/hierarchical.py b/turftopic/hierarchical.py index d0a5144..f40f56f 100644 --- a/turftopic/hierarchical.py +++ b/turftopic/hierarchical.py @@ -116,6 +116,20 @@ class TopicNode: document_topic_vector: Optional[np.ndarray] = None children: Optional[list[TopicNode]] = None + @property + def components_(self) -> np.ndarray: + if self.children is None: + raise ValueError("Current node is a leaf, no components.") + return np.stack([child.word_importance for child in self.children]) + + @property + def doc_topic_matrix(self) -> np.ndarray: + if self.children is None: + raise ValueError("Current node is a leaf, no doc_topic_matrix.") + return np.stack( + [child.document_topic_vector for child in self.children] + ).T + @classmethod def create_root( cls, @@ -146,6 +160,14 @@ def create_root( children=children, ) + def set_path(self, path: tuple[int]): + """Sets path for current node and all children accordingly.""" + self.path = path + if self.children is None: + return + for i_child, child in enumerate(self.children): + child.set_path((*self.path, i_child)) + @property def level(self) -> int: """Indicates how deep down the hierarchy the topic is.""" @@ -275,3 +297,29 @@ def divide_children(self, n_subtopics: int, **kwargs): def plot_tree(self): """Plots hierarchy as an interactive tree in Plotly.""" return _tree_plot(self) + + def join(self, *subtopics: int, **kwargs): + slot = min(subtopics) + max_subtopics = max(subtopics) + if len(self.children) < (max_subtopics - 1): + raise ValueError( + "These subtopics don't exist on the current node." + ) + if slot < 0: + raise ValueError( + "Outlier topics (-1) cannot be merged with other topics." + ) + if self.children is None: + raise ValueError( + "Current Node is a leaf, children can't be joined." 
+ ) + try: + self.children[slot] = self.model.join_subtopics( + subtopics, self, **kwargs + ) + self.set_path(self.path) + except AttributeError as e: + raise AttributeError( + "Looks like your model is not an agglomerative hierarchical model." + ) from e + return self diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index b69181c..fa74b0b 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -1,6 +1,6 @@ import warnings from datetime import datetime -from typing import Literal, Optional, Union +from typing import Iterable, Literal, Optional, Union import numpy as np from rich.console import Console @@ -18,6 +18,7 @@ from turftopic.feature_importance import (bayes_rule, cluster_centroid_distance, ctf_idf, soft_ctf_idf) +from turftopic.hierarchical import TopicNode from turftopic.vectorizer import default_vectorizer integer_message = """ @@ -230,11 +231,12 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: ] ) old_labels = [label for label in self.classes_ if label != -1] - new_labels = AgglomerativeClustering( + clustering = AgglomerativeClustering( n_clusters=n_reduce_to, metric="cosine", linkage="average", - ).fit_predict(interesting_topic_vectors) + ) + new_labels = clustering.fit_predict(interesting_topic_vectors) res = {} if -1 in self.classes_: res[-1] = -1 @@ -254,6 +256,58 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels + def join_subtopics( + self, subtopics: Iterable[int], hierarchy: Optional[TopicNode] = None + ) -> TopicNode: + """Joins subtopics in a topic hierarchy and returns the joint TopicNode. + > Note that this method does not alter the underlying hierarchy! + > You will need to use the join() method of a hierarchy for that. + + Parameters + ---------- + subtopics: iterable of int + Indices of subtopics to be joint. + hierarchy: TopicNode, default None + Hierarchy to join subtopics in, defaults to the root hierarchy of the model. + + Returns + ------- + TopicNode + New topic made up of the joint subtopics. + """ + if hierarchy is None: + hierarchy = self.hierarchy + subtopics = list(set(subtopics)) + slot = min(subtopics) + max_subtopics = max(subtopics) + if len(self.children) < (max_subtopics - 1): + raise ValueError( + "These subtopics don't exist on the current node." + ) + if slot < 0: + raise ValueError( + "Outlier topics (-1) cannot be merged with other topics." + ) + if self.children is None: + raise ValueError( + "Current Node is a leaf, children can't be joined." + ) + path = (*hierarchy.path, slot) + children = [self.hierarchy[sub] for sub in subtopics] + doc_topic_vector = self.hierarchy.doc_topic_matrix[:, subtopics].sum( + axis=1 + ) + rest = [ + doc_topic_vector + for i_topic, doc_topic_vector in enumerate( + self.hierarchy.doc_topic_matrix.T + ) + if i_topic not in subtopics + ] + doc_topic_matrix = np.stack([doc_topic_vector, rest]).T + # TODO + pass + def reduce_topics( self, n_reduce_to: int, @@ -286,6 +340,7 @@ def reduce_topics( self.labels_ = self._merge_smallest(n_reduce_to) elif reduction_method == "agglomerative": self.labels_ = self._merge_agglomerative(n_reduce_to) + self.estimate_components(self.feature_importance) return self.labels_ def reset_reduction(self): @@ -326,6 +381,10 @@ def estimate_components( ) clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) + if -1 in self.classes_: + # Putting outliers in the last position, so that when you index things, + # it works. 
+ self.classes_ = np.array([*self.classes_[1:], -1]) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) From ef4c28d420adc09f36254321dc0cc0ab369cc4de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 21 Oct 2024 16:10:43 +0200 Subject: [PATCH 13/13] Revert "Started implementing hierarchical topic joining in clustering models" This reverts commit c6ac90b5f5745d305dbebaa3fbc33e8d00e6b589. --- turftopic/hierarchical.py | 48 --------------------------- turftopic/models/cluster.py | 65 ++----------------------------------- 2 files changed, 3 insertions(+), 110 deletions(-) diff --git a/turftopic/hierarchical.py b/turftopic/hierarchical.py index f40f56f..d0a5144 100644 --- a/turftopic/hierarchical.py +++ b/turftopic/hierarchical.py @@ -116,20 +116,6 @@ class TopicNode: document_topic_vector: Optional[np.ndarray] = None children: Optional[list[TopicNode]] = None - @property - def components_(self) -> np.ndarray: - if self.children is None: - raise ValueError("Current node is a leaf, no components.") - return np.stack([child.word_importance for child in self.children]) - - @property - def doc_topic_matrix(self) -> np.ndarray: - if self.children is None: - raise ValueError("Current node is a leaf, no doc_topic_matrix.") - return np.stack( - [child.document_topic_vector for child in self.children] - ).T - @classmethod def create_root( cls, @@ -160,14 +146,6 @@ def create_root( children=children, ) - def set_path(self, path: tuple[int]): - """Sets path for current node and all children accordingly.""" - self.path = path - if self.children is None: - return - for i_child, child in enumerate(self.children): - child.set_path((*self.path, i_child)) - @property def level(self) -> int: """Indicates how deep down the hierarchy the topic is.""" @@ -297,29 +275,3 @@ def divide_children(self, n_subtopics: int, **kwargs): def plot_tree(self): """Plots hierarchy as an interactive tree in Plotly.""" return _tree_plot(self) - - def join(self, *subtopics: int, **kwargs): - slot = min(subtopics) - max_subtopics = max(subtopics) - if len(self.children) < (max_subtopics - 1): - raise ValueError( - "These subtopics don't exist on the current node." - ) - if slot < 0: - raise ValueError( - "Outlier topics (-1) cannot be merged with other topics." - ) - if self.children is None: - raise ValueError( - "Current Node is a leaf, children can't be joined." - ) - try: - self.children[slot] = self.model.join_subtopics( - subtopics, self, **kwargs - ) - self.set_path(self.path) - except AttributeError as e: - raise AttributeError( - "Looks like your model is not an agglomerative hierarchical model." 
- ) from e - return self diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index fa74b0b..b69181c 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -1,6 +1,6 @@ import warnings from datetime import datetime -from typing import Iterable, Literal, Optional, Union +from typing import Literal, Optional, Union import numpy as np from rich.console import Console @@ -18,7 +18,6 @@ from turftopic.feature_importance import (bayes_rule, cluster_centroid_distance, ctf_idf, soft_ctf_idf) -from turftopic.hierarchical import TopicNode from turftopic.vectorizer import default_vectorizer integer_message = """ @@ -231,12 +230,11 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: ] ) old_labels = [label for label in self.classes_ if label != -1] - clustering = AgglomerativeClustering( + new_labels = AgglomerativeClustering( n_clusters=n_reduce_to, metric="cosine", linkage="average", - ) - new_labels = clustering.fit_predict(interesting_topic_vectors) + ).fit_predict(interesting_topic_vectors) res = {} if -1 in self.classes_: res[-1] = -1 @@ -256,58 +254,6 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels - def join_subtopics( - self, subtopics: Iterable[int], hierarchy: Optional[TopicNode] = None - ) -> TopicNode: - """Joins subtopics in a topic hierarchy and returns the joint TopicNode. - > Note that this method does not alter the underlying hierarchy! - > You will need to use the join() method of a hierarchy for that. - - Parameters - ---------- - subtopics: iterable of int - Indices of subtopics to be joint. - hierarchy: TopicNode, default None - Hierarchy to join subtopics in, defaults to the root hierarchy of the model. - - Returns - ------- - TopicNode - New topic made up of the joint subtopics. - """ - if hierarchy is None: - hierarchy = self.hierarchy - subtopics = list(set(subtopics)) - slot = min(subtopics) - max_subtopics = max(subtopics) - if len(self.children) < (max_subtopics - 1): - raise ValueError( - "These subtopics don't exist on the current node." - ) - if slot < 0: - raise ValueError( - "Outlier topics (-1) cannot be merged with other topics." - ) - if self.children is None: - raise ValueError( - "Current Node is a leaf, children can't be joined." - ) - path = (*hierarchy.path, slot) - children = [self.hierarchy[sub] for sub in subtopics] - doc_topic_vector = self.hierarchy.doc_topic_matrix[:, subtopics].sum( - axis=1 - ) - rest = [ - doc_topic_vector - for i_topic, doc_topic_vector in enumerate( - self.hierarchy.doc_topic_matrix.T - ) - if i_topic not in subtopics - ] - doc_topic_matrix = np.stack([doc_topic_vector, rest]).T - # TODO - pass - def reduce_topics( self, n_reduce_to: int, @@ -340,7 +286,6 @@ def reduce_topics( self.labels_ = self._merge_smallest(n_reduce_to) elif reduction_method == "agglomerative": self.labels_ = self._merge_agglomerative(n_reduce_to) - self.estimate_components(self.feature_importance) return self.labels_ def reset_reduction(self): @@ -381,10 +326,6 @@ def estimate_components( ) clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) - if -1 in self.classes_: - # Putting outliers in the last position, so that when you index things, - # it works. - self.classes_ = np.array([*self.classes_[1:], -1]) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] )
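
Usage notes (illustrative sketches, not part of the patches above):

The Bayes' rule estimator introduced in PATCH 01 and hardened in PATCH 09 can be sanity-checked in isolation. A minimal sketch, assuming only that turftopic with this series applied is importable; the toy matrices are made up for illustration:

    import numpy as np
    import scipy.sparse as spr

    from turftopic.feature_importance import bayes_rule

    # Toy inputs: two documents, two topics, three vocabulary terms.
    doc_topic_matrix = np.array([[1.0, 0.0], [0.0, 1.0]])
    doc_term_matrix = spr.csr_matrix([[2.0, 1.0, 0.0], [0.0, 1.0, 3.0]])

    p_tw = bayes_rule(doc_topic_matrix, doc_term_matrix)
    print(p_tw.shape)        # (n_topics, vocab_size) == (2, 3)
    print(p_tw.sum(axis=0))  # each column is p(topic|word), so columns sum to 1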
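
The model-level workflow the series enables, sketched under the assumption that the existing public entry points (ClusteringTopicModel, fit, print_topics) behave as on the main branch; the corpus is a placeholder, not real data:

    from turftopic import ClusteringTopicModel

    corpus: list[str] = [...]  # placeholder: any list of raw documents

    # PATCH 01: Bayes' rule is a new feature_importance option.
    model = ClusteringTopicModel(feature_importance="bayes")
    model.fit(corpus)
    model.print_topics()

    # PATCHES 02/10: term importances can be re-estimated on the fitted
    # clustering without re-encoding or re-clustering the corpus.
    model.estimate_components("centroid")

    # PATCHES 03/05: post-hoc topic reduction is reversible. At the end of
    # the series reduce_topics only relabels documents, so the topic-term
    # matrix is refreshed explicitly here.
    model.reduce_topics(10, reduction_method="smallest")
    model.estimate_components("bayes")
    model.reset_reduction()  # restores the original clustering and components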
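
For the dynamic path touched by PATCHES 09 and 11, a sketch assuming the DynamicTopicModel interface the rest of the library provides (a list of timestamps and a bins argument for fit_transform_dynamic, plus print_topics_over_time), neither of which is introduced by this series:

    from datetime import datetime

    from turftopic import ClusteringTopicModel

    corpus: list[str] = [...]           # placeholder documents
    timestamps: list[datetime] = [...]  # one timestamp per document

    model = ClusteringTopicModel(feature_importance="bayes")
    doc_topic_matrix = model.fit_transform_dynamic(
        corpus, timestamps=timestamps, bins=10
    )
    model.print_topics_over_time()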