From 5692b73c5f6d7b94e4a14433875935b17dd51475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:33:19 +0200 Subject: [PATCH] Added reduce_topics method to clustering models --- turftopic/models/cluster.py | 46 ++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 26bcc14..4954695 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -239,12 +239,56 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels + def reduce_topics( + self, + n_reduce_to: int, + reduction_method: Literal["smallest", "agglomerative"], + ) -> np.ndarray: + """Reduces the clustering to the desired amount with the given method. + + Parameters + ---------- + n_reduce_to: int + Number of topics to reduce to. + The specified reduction method will be used to merge them. + The fitted labels are replaced by the merged ones. + reduction_method: 'agglomerative', 'smallest' + Method used to reduce the number of topics post-hoc. + When 'agglomerative', BERTopic's topic reduction method is used, + where topic vectors are hierarchically clustered. + When 'smallest', the smallest topic gets merged into the closest + non-outlier cluster until the desired number + is achieved similarly to Top2Vec. + + Returns + ------- + ndarray of shape (n_documents) + New cluster labels for documents. + """ + if reduction_method == "smallest": + self.labels_ = self._merge_smallest(n_reduce_to) + elif reduction_method == "agglomerative": + self.labels_ = self._merge_agglomerative(n_reduce_to) + return self.labels_ + def estimate_components( self, feature_importance: Literal[ - "centroid", "soft_ctf_idf", "bayes", "c-tf-idf" + "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf" ], ) -> np.array: + """Estimates feature importances based on a fitted clustering. 
+ + Parameters + ---------- + feature_importance: {'centroid', 'soft-c-tf-idf', 'bayes', 'c-tf-idf'} + Estimation method. + + Returns + ------- + ndarray of shape (n_components, n_vocab) + Topic-term matrix. + """ clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) self.topic_sizes_ = np.array(