From ffb51a29ae40e5082eba381a1a41d1d0a47e6c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:06:59 +0200
Subject: [PATCH 01/13] Added Bayes' rule feature importance to clustering
 models

---
 turftopic/feature_importance.py | 35 +++++++++++++++++++++++++++++++++
 turftopic/models/cluster.py     | 19 ++++++++++++++++--
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py
index 0657dd7..f329b79 100644
--- a/turftopic/feature_importance.py
+++ b/turftopic/feature_importance.py
@@ -1,6 +1,7 @@
 import numpy as np
 import scipy.sparse as spr
 from sklearn.metrics import pairwise_distances
+from sklearn.preprocessing import normalize
 
 
 def cluster_centroid_distance(
@@ -94,3 +95,37 @@ def ctf_idf(
         component = freq * np.log(1 + average / overall_freq)
         components.append(component)
     return np.stack(components)
+
+
+def bayes_rule(
+    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+) -> np.ndarray:
+    """Computes feature importance based on Bayes' rule.
+    The importance of a word for a topic is the probability of the topic conditional on the word.
+
+    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
+
+    Parameters
+    ----------
+    doc_topic_matrix: np.ndarray
+        Document-topic matrix of shape (n_documents, n_topics)
+    doc_term_matrix: spr.csr_matrix
+        Document-term matrix of shape (n_documents, vocab_size)
+
+    Returns
+    -------
+    ndarray of shape (n_topics, vocab_size)
+        Term importance matrix.
+    """
+    eps = np.finfo(float).eps
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = p_w / p_w.sum()
+    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = p_t / p_t.sum()
+    term_importance = doc_topic_matrix.T @ doc_term_matrix
+    overall_in_topic = np.abs(term_importance).sum(axis=1)
+    p_wt = (term_importance.T / (overall_in_topic + eps)).T
+    p_wt = normalize(p_wt, norm="l1", axis=1)
+    p_tw = (p_wt.T * p_t).T / p_w
+    p_tw = normalize(p_tw, axis=0, norm="l1")
+    return p_tw
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 0318546..a31f802 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -14,6 +14,7 @@
 from turftopic.base import ContextualModel, Encoder
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.feature_importance import (
+    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     soft_ctf_idf,
@@ -156,7 +157,10 @@ def __init__(
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
         feature_importance: Literal[
-            "c-tf-idf", "soft-c-tf-idf", "centroid"
+            "c-tf-idf",
+            "soft-c-tf-idf",
+            "centroid",
+            "bayes",
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
         reduction_method: Literal[
@@ -166,7 +170,12 @@
     ):
         self.encoder = encoder
         self.random_state = random_state
-        if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
+        if feature_importance not in [
+            "c-tf-idf",
+            "soft-c-tf-idf",
+            "centroid",
+            "bayes",
+        ]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
@@ -256,6 +265,10 @@ def _estimate_parameters(
                 self.vocab_embeddings,
                 metric="cosine",
             )
+        elif self.feature_importance == "bayes":
+            self.components_ = bayes_rule(
+                document_topic_matrix, doc_term_matrix
+            )
         else:
             self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix)
 
@@ -368,6 +381,8 @@ def fit_transform_dynamic(
                     )
                 elif self.feature_importance == "c-tf-idf":
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
+            elif 
self.feature_importance == "bayes": + components = bayes_rule(t_doc_topic_matrix, t_doc_term_matrix) elif self.feature_importance == "centroid": time_index = time_labels == i_timebin t_topic_vectors = calculate_topic_vectors( From e6b721a76c833ae5d6a4b856e8bf30eb4e060703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:25:13 +0200 Subject: [PATCH 02/13] Made clustering models' feature importance estimation more efficient --- turftopic/models/cluster.py | 44 +++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index a31f802..26bcc14 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -12,6 +12,7 @@ from sklearn.preprocessing import label_binarize from turftopic.base import ContextualModel, Encoder +from turftopic.data import TopicData from turftopic.dynamic import DynamicTopicModel from turftopic.feature_importance import ( bayes_rule, @@ -238,28 +239,32 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels - def _estimate_parameters( + def estimate_components( self, - embeddings: np.ndarray, - doc_term_matrix: np.ndarray, - ): + feature_importance: Literal[ + "centroid", "soft_ctf_idf", "bayes", "c-tf-idf" + ], + ) -> np.array: clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) - self.topic_vectors_ = calculate_topic_vectors(self.labels_, embeddings) - self.vocab_embeddings = self.encoder_.encode( - self.vectorizer.get_feature_names_out() - ) # type: ignore + self.topic_vectors_ = calculate_topic_vectors( + self.labels_, self.embeddings + ) document_topic_matrix = label_binarize( self.labels_, classes=self.classes_ ) if self.feature_importance == "soft-c-tf-idf": self.components_ = soft_ctf_idf( - document_topic_matrix, doc_term_matrix + document_topic_matrix, self.doc_term_matrix ) # type: ignore elif self.feature_importance == "centroid": + if not hasattr(self, "vocab_embeddings"): + self.vocab_embeddings = self.encoder_.encode( + self.vectorizer.get_feature_names_out() + ) # type: ignore self.components_ = cluster_centroid_distance( self.topic_vectors_, self.vocab_embeddings, @@ -267,10 +272,13 @@ def _estimate_parameters( ) elif self.feature_importance == "bayes": self.components_ = bayes_rule( - document_topic_matrix, doc_term_matrix + document_topic_matrix, self.doc_term_matrix ) else: - self.components_ = ctf_idf(document_topic_matrix, doc_term_matrix) + self.components_ = ctf_idf( + document_topic_matrix, self.doc_term_matrix + ) + return self.components_ def fit_predict( self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None @@ -296,6 +304,7 @@ def fit_predict( if embeddings is None: status.update("Encoding documents") embeddings = self.encoder_.encode(raw_documents) + self.embeddings = embeddings console.log("Encoding done.") status.update("Extracting terms") self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) @@ -309,10 +318,7 @@ def fit_predict( self.labels_ = self.clustering.fit_predict(reduced_embeddings) console.log("Clustering done.") status.update("Estimating parameters.") - self._estimate_parameters( - embeddings, - self.doc_term_matrix, - ) + self.estimate_components(self.feature_importance) console.log("Parameter estimation done.") if self.n_reduce_to is not None: n_topics = self.classes_.shape[0] @@ -327,12 +333,12 
@@ def fit_predict(
                     f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
                 )
                 status.update("Reestimating parameters.")
-                self._estimate_parameters(
-                    embeddings,
-                    self.doc_term_matrix,
-                )
+                self.estimate_components(self.feature_importance)
                 console.log("Reestimation done.")
             console.log("Model fitting done.")
+        self.doc_term_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
         return self.labels_
 
     def fit_transform(

From 5692b73c5f6d7b94e4a14433875935b17dd51475 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:33:19 +0200
Subject: [PATCH 03/13] Added reduce_topics method to clustering models

---
 turftopic/models/cluster.py | 46 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 26bcc14..4954695 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -239,12 +239,56 @@ def _merge_smallest(self, n_reduce_to: int):
         labels[labels == from_topic] = to_topic
         return labels
 
+    def reduce_topics(
+        self,
+        n_reduce_to: int,
+        reduction_method: Literal["smallest", "agglomerative"],
+    ) -> np.ndarray:
+        """Reduces the clustering to the desired number of topics with the given method.
+
+        Parameters
+        ----------
+        n_reduce_to: int
+            Number of topics to reduce the clustering to.
+            The specified reduction method will be used to merge
+            the clusters until this number is reached.
+        reduction_method: 'agglomerative', 'smallest'
+            Method used to reduce the number of topics post-hoc.
+            When 'agglomerative', BERTopic's topic reduction method is used,
+            where topic vectors are hierarchically clustered.
+            When 'smallest', the smallest topic gets merged into the closest
+            non-outlier cluster until the desired number
+            is achieved similarly to Top2Vec.
+
+        Returns
+        -------
+        ndarray of shape (n_documents)
+            New cluster labels for documents.
+        """
+        if reduction_method == "smallest":
+            self.labels_ = self._merge_smallest(n_reduce_to)
+        elif reduction_method == "agglomerative":
+            self.labels_ = self._merge_agglomerative(n_reduce_to)
+        return self.labels_
+
     def estimate_components(
         self,
         feature_importance: Literal[
-            "centroid", "soft_ctf_idf", "bayes", "c-tf-idf"
+            "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
         ],
     ) -> np.array:
+        """Estimates feature importances based on a fitted clustering.
+
+        Parameters
+        ----------
+        feature_importance: {'centroid', 'soft-c-tf-idf', 'bayes', 'c-tf-idf'}
+            Estimation method.
+
+        Returns
+        -------
+        ndarray of shape (n_components, n_vocab)
+            Topic-term matrix.
+        """
         clusters = np.unique(self.labels_)

From 963c89f3b689baae6941213912969e2758cd6d52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:36:02 +0200
Subject: [PATCH 04/13] Fixed docstrings for feature_importance

---
 turftopic/models/cluster.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 4954695..b6bb3ee 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -127,13 +127,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
         Clustering method to use for finding topics.
         Defaults to OPTICS with 25 minimum cluster size.
         To imitate the behavior of BERTopic or Top2Vec you should use HDBSCAN.
-    feature_importance: 'soft-c-tf-idf', 'c-tf-idf' or 'centroid', default 'soft-c-tf-idf'
+    feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf'
         Method for estimating term importances.
         'centroid' uses distances from cluster centroid similarly
         to Top2Vec.
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
+        'bayes' uses Bayes' rule.
     n_reduce_to: int, default None
         Number of topics to reduce topics to.
         The specified reduction method will be used to merge them.
@@ -281,8 +282,14 @@ def estimate_components(
 
         Parameters
         ----------
-        feature_importance: {'centroid', 'soft-c-tf-idf', 'bayes', 'c-tf-idf'}
-            Estimation method.
+        feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}
+            Method for estimating term importances.
+            'centroid' uses distances from cluster centroid similarly
+            to Top2Vec.
+            'c-tf-idf' uses BERTopic's c-tf-idf.
+            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
+            be very similar to 'c-tf-idf'.
+            'bayes' uses Bayes' rule.
 
         Returns
         -------

From a9bb21c16a12fee18c0fdc0c5a2f4cb0fbe619f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3%A1rton=20Kardos?=
Date: Fri, 2 Aug 2024 13:43:09 +0200
Subject: [PATCH 05/13] Added option for resetting topic reduction

---
 turftopic/models/cluster.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index b6bb3ee..d6fae2d 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from typing import Literal, Optional, Union
 
@@ -12,7 +13,6 @@
 from sklearn.preprocessing import label_binarize
 
 from turftopic.base import ContextualModel, Encoder
-from turftopic.data import TopicData
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.feature_importance import (
     bayes_rule,
@@ -266,12 +266,21 @@ def reduce_topics(
         ndarray of shape (n_documents)
             New cluster labels for documents.
         """
+        if not hasattr(self, "original_labels_"):
+            self.original_labels_ = self.labels_
         if reduction_method == "smallest":
             self.labels_ = self._merge_smallest(n_reduce_to)
         elif reduction_method == "agglomerative":
             self.labels_ = self._merge_agglomerative(n_reduce_to)
         return self.labels_
 
+    def reset_reduction(self):
+        if not hasattr(self, "original_labels_"):
+            warnings.warn("Topics have never been reduced, nothing to reset.")
+        else:
+            self.labels_ = self.original_labels_
+            self.estimate_components(self.feature_importance)
+
     def estimate_components(
         self,
         feature_importance: Literal[
@@ -376,10 +385,7 @@
             status.update(
                 f"Reducing topics from {n_topics} to {self.n_reduce_to}"
             )
-            if self.reduction_method == "agglomerative":
-                self.labels_ = self._merge_agglomerative(self.n_reduce_to)
-            else:
-                self.labels_ = self._merge_smallest(self.n_reduce_to)
+            self.reduce_topics(self.n_reduce_to, self.reduction_method)
             console.log(
                 f"Topic reduction done from {n_topics} to {self.n_reduce_to}."
) From 76789f31158a62ff943f3e74817b4bbf9b071312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:43:40 +0200 Subject: [PATCH 06/13] Added docstring to reset_reduction --- turftopic/models/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index d6fae2d..814ecb5 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -275,6 +275,7 @@ def reduce_topics( return self.labels_ def reset_reduction(self): + """Resets topic reductions to the original clustering.""" if not hasattr(self, "original_labels_"): warnings.warn("Topics have never been reduced, nothing to reset.") else: From 7d98e12ee3345bb1a0ebbb9271bd310c3cda0f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:44:47 +0200 Subject: [PATCH 07/13] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 384d349..ac0d284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ line-length=79 [tool.poetry] name = "turftopic" -version = "0.5.0" +version = "0.5.1" description = "Topic modeling with contextual representations from sentence transformers." authors = ["Márton Kardos "] license = "MIT" From 10d27114e163225dcc38a681af2a221e727dcbe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 2 Aug 2024 13:46:57 +0200 Subject: [PATCH 08/13] Fixed bug with embeddings --- turftopic/models/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 814ecb5..95629a3 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -365,8 +365,8 @@ def fit_predict( if embeddings is None: status.update("Encoding documents") embeddings = self.encoder_.encode(raw_documents) - self.embeddings = embeddings console.log("Encoding done.") + self.embeddings = embeddings status.update("Extracting terms") self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents) console.log("Term extraction done.") From 660a91f520dd1b30a24db82dd7048b932a0dbb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 5 Aug 2024 10:01:14 +0200 Subject: [PATCH 09/13] Made term importance estimation more immune to missing values --- turftopic/feature_importance.py | 33 +++++---- turftopic/models/cluster.py | 115 ++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 61 deletions(-) diff --git a/turftopic/feature_importance.py b/turftopic/feature_importance.py index f329b79..ea1fb32 100644 --- a/turftopic/feature_importance.py +++ b/turftopic/feature_importance.py @@ -1,13 +1,11 @@ import numpy as np import scipy.sparse as spr -from sklearn.metrics import pairwise_distances -from sklearn.preprocessing import normalize +from sklearn.metrics.pairwise import cosine_similarity def cluster_centroid_distance( cluster_centroids: np.ndarray, vocab_embeddings: np.ndarray, - metric="cosine", ) -> np.ndarray: """Computes feature importances based on distances between topic vectors (cluster centroids) and term embeddings @@ -18,25 +16,21 @@ def cluster_centroid_distance( Coordinates of cluster centroids of shape (n_topics, embedding_size) vocab_embeddings: np.ndarray Term embeddings of shape (vocab_size, embedding_size) - metric: str, defaul 'cosine' - Metric used to compute distance from centroid. - See documentation for sklearn.metrics.pairwise.distance_metrics - for valid values. 
Returns ------- ndarray of shape (n_topics, vocab_size) Term importance matrix. """ - distances = pairwise_distances( - cluster_centroids, vocab_embeddings, metric=metric + n_components = cluster_centroids.shape[0] + n_vocab = vocab_embeddings.shape[0] + components = np.full((n_components, n_vocab), np.nan) + valid_centroids = np.all(np.isfinite(cluster_centroids), axis=1) + similarities = cosine_similarity( + cluster_centroids[valid_centroids], vocab_embeddings ) - similarities = -distances / np.max(distances) - # Z-score transformation - similarities = (similarities - np.mean(similarities)) / np.std( - similarities - ) - return similarities + components[valid_centroids, :] = similarities + return components def soft_ctf_idf( @@ -88,6 +82,7 @@ def ctf_idf( components = [] overall_freq = np.ravel(np.asarray(doc_term_matrix.sum(axis=0))) average = overall_freq.sum() / n_topics + overall_freq[overall_freq == 0] = np.finfo(float).eps for i_topic in range(n_topics): freq = np.ravel( np.asarray(doc_term_matrix[labels == i_topic].sum(axis=0)) @@ -120,12 +115,14 @@ def bayes_rule( eps = np.finfo(float).eps p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) p_w = p_w / p_w.sum() + p_w[p_w <= 0] = eps p_t = doc_topic_matrix.sum(axis=0) p_t = p_t / p_t.sum() term_importance = doc_topic_matrix.T @ doc_term_matrix overall_in_topic = np.abs(term_importance).sum(axis=1) - p_wt = (term_importance.T / (overall_in_topic + eps)).T - p_wt = normalize(p_wt, norm="l1", axis=1) + overall_in_topic[overall_in_topic <= 0] = eps + p_wt = (term_importance.T / (overall_in_topic)).T + p_wt /= p_wt.sum(axis=1)[:, None] p_tw = (p_wt.T * p_t).T / p_w - p_tw = normalize(p_tw, axis=0, norm="l1") + p_tw /= np.nansum(p_tw, axis=0) return p_tw diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index 95629a3..3cd7eb3 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -203,6 +203,22 @@ def __init__( self.n_reduce_to = n_reduce_to self.reduction_method = reduction_method + def _calculate_topic_vectors( + self, is_in_slice: Optional[np.ndarray] = None + ) -> np.ndarray: + label_to_idx = {label: idx for idx, label in enumerate(self.classes_)} + n_topics = len(self.classes_) + n_dims = self.embeddings.shape[1] + topic_vectors = np.full((n_topics, n_dims), np.nan) + for label in np.unique(self.labels_): + doc_idx = self.labels_ == label + if is_in_slice is not None: + doc_idx = doc_idx & is_in_slice + topic_vectors[label_to_idx[label], :] = np.mean( + self.embeddings[doc_idx], axis=0 + ) + return topic_vectors + def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: n_topics = self.components_.shape[0] res = {old_label: old_label for old_label in self.classes_} @@ -311,9 +327,7 @@ def estimate_components( self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) - self.topic_vectors_ = calculate_topic_vectors( - self.labels_, self.embeddings - ) + self.topic_vectors_ = self._calculate_topic_vectors() document_topic_matrix = label_binarize( self.labels_, classes=self.classes_ ) @@ -329,7 +343,6 @@ def estimate_components( self.components_ = cluster_centroid_distance( self.topic_vectors_, self.vocab_embeddings, - metric="cosine", ) elif self.feature_importance == "bayes": self.components_ = bayes_rule( @@ -394,7 +407,7 @@ def fit_predict( self.estimate_components(self.feature_importance) console.log("Reestimation done.") console.log("Model fitting done.") - self.doc_term_matrix = label_binarize( + self.doc_topic_matrix = label_binarize( 
self.labels_, classes=self.classes_ ) return self.labels_ @@ -405,6 +418,56 @@ def fit_transform( labels = self.fit_predict(raw_documents, y, embeddings) return label_binarize(labels, classes=self.classes_) + def estimate_temporal_components( + self, + time_labels, + time_bin_edges, + feature_importance: Literal[ + "c-tf-idf", "soft-c-tf-idf", "centroid", "bayes" + ], + ): + n_comp, n_vocab = self.components_.shape + n_bins = len(self.time_bin_edges) - 1 + self.temporal_components_ = np.full( + (n_bins, n_comp, n_vocab), + np.nan, + dtype=self.components_.dtype, + ) + self.temporal_importance_ = np.zeros((n_bins, n_comp)) + for i_timebin in np.unique(time_labels): + topic_importances = self.doc_topic_matrix[ + time_labels == i_timebin + ].sum(axis=0) + if not topic_importances.sum() == 0: + topic_importances = topic_importances / topic_importances.sum() + self.temporal_importance_[i_timebin, :] = topic_importances + t_dtm = self.doc_term_matrix[time_labels == i_timebin] + t_doc_topic = self.doc_topic_matrix[time_labels == i_timebin] + if feature_importance == "c-tf-idf": + self.temporal_components_[i_timebin] = ctf_idf( + t_doc_topic, t_dtm + ) + elif feature_importance == "soft-c-tf-idf": + self.temporal_components_[i_timebin] = soft_ctf_idf( + t_doc_topic, t_dtm + ) + elif feature_importance == "bayes": + self.temporal_components_[i_timebin] = bayes_rule( + t_doc_topic, t_dtm + ) + elif feature_importance == "centroid": + t_topic_vectors = self._calculate_topic_vectors( + time_labels == i_timebin, + ) + components = cluster_centroid_distance( + t_topic_vectors, + self.vocab_embeddings, + ) + mask_terms = t_dtm.sum(axis=0).astype(np.float64) + mask_terms = np.squeeze(np.asarray(mask_terms)) + components[:, mask_terms == 0] = np.nan + self.temporal_components_[i_timebin] = components + def fit_transform_dynamic( self, raw_documents, @@ -431,42 +494,8 @@ def fit_transform_dynamic( self.temporal_importance_ = np.zeros((n_bins, n_comp)) if embeddings is None: embeddings = self.encoder_.encode(raw_documents) - for i_timebin in np.unique(time_labels): - topic_importances = doc_topic_matrix[time_labels == i_timebin].sum( - axis=0 - ) - topic_importances = topic_importances / topic_importances.sum() - t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin] - t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin] - if "c-tf-idf" in self.feature_importance: - if self.feature_importance == "soft-c-tf-idf": - components = soft_ctf_idf( - t_doc_topic_matrix, t_doc_term_matrix - ) - elif self.feature_importance == "c-tf-idf": - components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix) - elif self.feature_importance == "bayes": - components = bayes_rule(t_doc_topic_matrix, t_doc_term_matrix) - elif self.feature_importance == "centroid": - time_index = time_labels == i_timebin - t_topic_vectors = calculate_topic_vectors( - self.labels_, - embeddings, - time_index, - ) - topic_mask = np.isnan(t_topic_vectors).all( - axis=1, keepdims=True - ) - t_topic_vectors[:] = 0 - components = cluster_centroid_distance( - t_topic_vectors, - self.vocab_embeddings, - metric="cosine", - ) - components *= topic_mask - mask_terms = t_doc_term_matrix.sum(axis=0).astype(np.float64) - mask_terms[mask_terms == 0] = np.nan - components *= mask_terms - self.temporal_components_[i_timebin] = components - self.temporal_importance_[i_timebin] = topic_importances + self.embeddings = embeddings + self.estimate_temporal_components( + time_labels, self.time_bin_edges, self.feature_importance + ) return 
doc_topic_matrix

From 7ee21fdb3452528cb7a80f1a226ad1db00f814 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Mon, 5 Aug 2024 10:11:47 +0200
Subject: [PATCH 10/13] Fixed estimate_components

---
 turftopic/models/cluster.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index 3cd7eb3..dfbe262 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -14,12 +14,9 @@
 from turftopic.base import ContextualModel, Encoder
 from turftopic.dynamic import DynamicTopicModel
-from turftopic.feature_importance import (
-    bayes_rule,
-    cluster_centroid_distance,
-    ctf_idf,
-    soft_ctf_idf,
-)
+from turftopic.feature_importance import (bayes_rule,
+                                          cluster_centroid_distance, ctf_idf,
+                                          soft_ctf_idf)
 from turftopic.vectorizer import default_vectorizer
 
 integer_message = """
@@ -331,11 +328,11 @@ def estimate_components(
         document_topic_matrix = label_binarize(
             self.labels_, classes=self.classes_
         )
-        if self.feature_importance == "soft-c-tf-idf":
+        if feature_importance == "soft-c-tf-idf":
             self.components_ = soft_ctf_idf(
                 document_topic_matrix, self.doc_term_matrix
             )  # type: ignore
-        elif self.feature_importance == "centroid":
+        elif feature_importance == "centroid":
             if not hasattr(self, "vocab_embeddings"):
                 self.vocab_embeddings = self.encoder_.encode(
                     self.vectorizer.get_feature_names_out()
@@ -344,7 +341,7 @@
                 self.topic_vectors_,
                 self.vocab_embeddings,
             )
-        elif self.feature_importance == "bayes":
+        elif feature_importance == "bayes":
             self.components_ = bayes_rule(
                 document_topic_matrix, self.doc_term_matrix
             )

From b8688e1b5334ebc52a3110f79a35ccc2bfdc848c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Mon, 19 Aug 2024 16:14:02 +0200
Subject: [PATCH 11/13] Added docstrings and meaningful error messages to
 estimate_components

---
 turftopic/models/cluster.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
index dfbe262..b69181c 100644
--- a/turftopic/models/cluster.py
+++ b/turftopic/models/cluster.py
@@ -7,6 +7,7 @@
 from sentence_transformers import SentenceTransformer
 from sklearn.base import ClusterMixin, TransformerMixin
 from sklearn.cluster import OPTICS, AgglomerativeClustering
+from sklearn.exceptions import NotFittedError
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_distances
@@ -300,7 +301,7 @@ def estimate_components(
         feature_importance: Literal[
             "centroid", "soft-c-tf-idf", "bayes", "c-tf-idf"
         ],
-    ) -> np.array:
+    ) -> np.ndarray:
         """Estimates feature importances based on a fitted clustering.
 
         Parameters
         ----------
@@ -319,6 +320,10 @@
         ndarray of shape (n_components, n_vocab)
             Topic-term matrix.
         """
+        if getattr(self, "labels_", None) is None:
+            raise NotFittedError(
+                "The model has not been fitted yet, please fit the model before estimating components."
+            )
         clusters = np.unique(self.labels_)
         self.classes_ = np.sort(clusters)
         self.topic_sizes_ = np.array(
@@ -422,8 +427,31 @@ def estimate_temporal_components(
         feature_importance: Literal[
             "c-tf-idf", "soft-c-tf-idf", "centroid", "bayes"
         ],
-    ):
+    ) -> np.ndarray:
+        """Estimates temporal components based on a fitted topic model.
+ + Parameters + ---------- + feature_importance: {'soft-c-tf-idf', 'c-tf-idf', 'bayes', 'centroid'}, default 'soft-c-tf-idf' + Method for estimating term importances. + 'centroid' uses distances from cluster centroid similarly + to Top2Vec. + 'c-tf-idf' uses BERTopic's c-tf-idf. + 'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should + be very similar to 'c-tf-idf'. + 'bayes' uses Bayes' rule. + + Returns + ------- + ndarray of shape (n_time_bins, n_components, n_vocab) + Temporal topic-term matrix. + """ + if getattr(self, "components_", None) is None: + raise NotFittedError( + "The model has not been fitted yet, please fit the model before estimating temporal components." + ) n_comp, n_vocab = self.components_.shape + self.time_bin_edges = time_bin_edges n_bins = len(self.time_bin_edges) - 1 self.temporal_components_ = np.full( (n_bins, n_comp, n_vocab), @@ -464,6 +492,7 @@ def estimate_temporal_components( mask_terms = np.squeeze(np.asarray(mask_terms)) components[:, mask_terms == 0] = np.nan self.temporal_components_[i_timebin] = components + return self.temporal_components_ def fit_transform_dynamic( self, From c6ac90b5f5745d305dbebaa3fbc33e8d00e6b589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 27 Aug 2024 10:31:39 +0200 Subject: [PATCH 12/13] Started implementing hierarchical topic joining in clustering models --- turftopic/hierarchical.py | 48 +++++++++++++++++++++++++++ turftopic/models/cluster.py | 65 +++++++++++++++++++++++++++++++++++-- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/turftopic/hierarchical.py b/turftopic/hierarchical.py index d0a5144..f40f56f 100644 --- a/turftopic/hierarchical.py +++ b/turftopic/hierarchical.py @@ -116,6 +116,20 @@ class TopicNode: document_topic_vector: Optional[np.ndarray] = None children: Optional[list[TopicNode]] = None + @property + def components_(self) -> np.ndarray: + if self.children is None: + raise ValueError("Current node is a leaf, no components.") + return np.stack([child.word_importance for child in self.children]) + + @property + def doc_topic_matrix(self) -> np.ndarray: + if self.children is None: + raise ValueError("Current node is a leaf, no doc_topic_matrix.") + return np.stack( + [child.document_topic_vector for child in self.children] + ).T + @classmethod def create_root( cls, @@ -146,6 +160,14 @@ def create_root( children=children, ) + def set_path(self, path: tuple[int]): + """Sets path for current node and all children accordingly.""" + self.path = path + if self.children is None: + return + for i_child, child in enumerate(self.children): + child.set_path((*self.path, i_child)) + @property def level(self) -> int: """Indicates how deep down the hierarchy the topic is.""" @@ -275,3 +297,29 @@ def divide_children(self, n_subtopics: int, **kwargs): def plot_tree(self): """Plots hierarchy as an interactive tree in Plotly.""" return _tree_plot(self) + + def join(self, *subtopics: int, **kwargs): + slot = min(subtopics) + max_subtopics = max(subtopics) + if len(self.children) < (max_subtopics - 1): + raise ValueError( + "These subtopics don't exist on the current node." + ) + if slot < 0: + raise ValueError( + "Outlier topics (-1) cannot be merged with other topics." + ) + if self.children is None: + raise ValueError( + "Current Node is a leaf, children can't be joined." 
+ ) + try: + self.children[slot] = self.model.join_subtopics( + subtopics, self, **kwargs + ) + self.set_path(self.path) + except AttributeError as e: + raise AttributeError( + "Looks like your model is not an agglomerative hierarchical model." + ) from e + return self diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index b69181c..fa74b0b 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -1,6 +1,6 @@ import warnings from datetime import datetime -from typing import Literal, Optional, Union +from typing import Iterable, Literal, Optional, Union import numpy as np from rich.console import Console @@ -18,6 +18,7 @@ from turftopic.feature_importance import (bayes_rule, cluster_centroid_distance, ctf_idf, soft_ctf_idf) +from turftopic.hierarchical import TopicNode from turftopic.vectorizer import default_vectorizer integer_message = """ @@ -230,11 +231,12 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: ] ) old_labels = [label for label in self.classes_ if label != -1] - new_labels = AgglomerativeClustering( + clustering = AgglomerativeClustering( n_clusters=n_reduce_to, metric="cosine", linkage="average", - ).fit_predict(interesting_topic_vectors) + ) + new_labels = clustering.fit_predict(interesting_topic_vectors) res = {} if -1 in self.classes_: res[-1] = -1 @@ -254,6 +256,58 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels + def join_subtopics( + self, subtopics: Iterable[int], hierarchy: Optional[TopicNode] = None + ) -> TopicNode: + """Joins subtopics in a topic hierarchy and returns the joint TopicNode. + > Note that this method does not alter the underlying hierarchy! + > You will need to use the join() method of a hierarchy for that. + + Parameters + ---------- + subtopics: iterable of int + Indices of subtopics to be joint. + hierarchy: TopicNode, default None + Hierarchy to join subtopics in, defaults to the root hierarchy of the model. + + Returns + ------- + TopicNode + New topic made up of the joint subtopics. + """ + if hierarchy is None: + hierarchy = self.hierarchy + subtopics = list(set(subtopics)) + slot = min(subtopics) + max_subtopics = max(subtopics) + if len(self.children) < (max_subtopics - 1): + raise ValueError( + "These subtopics don't exist on the current node." + ) + if slot < 0: + raise ValueError( + "Outlier topics (-1) cannot be merged with other topics." + ) + if self.children is None: + raise ValueError( + "Current Node is a leaf, children can't be joined." + ) + path = (*hierarchy.path, slot) + children = [self.hierarchy[sub] for sub in subtopics] + doc_topic_vector = self.hierarchy.doc_topic_matrix[:, subtopics].sum( + axis=1 + ) + rest = [ + doc_topic_vector + for i_topic, doc_topic_vector in enumerate( + self.hierarchy.doc_topic_matrix.T + ) + if i_topic not in subtopics + ] + doc_topic_matrix = np.stack([doc_topic_vector, rest]).T + # TODO + pass + def reduce_topics( self, n_reduce_to: int, @@ -286,6 +340,7 @@ def reduce_topics( self.labels_ = self._merge_smallest(n_reduce_to) elif reduction_method == "agglomerative": self.labels_ = self._merge_agglomerative(n_reduce_to) + self.estimate_components(self.feature_importance) return self.labels_ def reset_reduction(self): @@ -326,6 +381,10 @@ def estimate_components( ) clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) + if -1 in self.classes_: + # Putting outliers in the last position, so that when you index things, + # it works. 
+ self.classes_ = np.array([*self.classes_[1:], -1]) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] ) From ef4c28d420adc09f36254321dc0cc0ab369cc4de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Mon, 21 Oct 2024 16:10:43 +0200 Subject: [PATCH 13/13] Revert "Started implementing hierarchical topic joining in clustering models" This reverts commit c6ac90b5f5745d305dbebaa3fbc33e8d00e6b589. --- turftopic/hierarchical.py | 48 --------------------------- turftopic/models/cluster.py | 65 ++----------------------------------- 2 files changed, 3 insertions(+), 110 deletions(-) diff --git a/turftopic/hierarchical.py b/turftopic/hierarchical.py index f40f56f..d0a5144 100644 --- a/turftopic/hierarchical.py +++ b/turftopic/hierarchical.py @@ -116,20 +116,6 @@ class TopicNode: document_topic_vector: Optional[np.ndarray] = None children: Optional[list[TopicNode]] = None - @property - def components_(self) -> np.ndarray: - if self.children is None: - raise ValueError("Current node is a leaf, no components.") - return np.stack([child.word_importance for child in self.children]) - - @property - def doc_topic_matrix(self) -> np.ndarray: - if self.children is None: - raise ValueError("Current node is a leaf, no doc_topic_matrix.") - return np.stack( - [child.document_topic_vector for child in self.children] - ).T - @classmethod def create_root( cls, @@ -160,14 +146,6 @@ def create_root( children=children, ) - def set_path(self, path: tuple[int]): - """Sets path for current node and all children accordingly.""" - self.path = path - if self.children is None: - return - for i_child, child in enumerate(self.children): - child.set_path((*self.path, i_child)) - @property def level(self) -> int: """Indicates how deep down the hierarchy the topic is.""" @@ -297,29 +275,3 @@ def divide_children(self, n_subtopics: int, **kwargs): def plot_tree(self): """Plots hierarchy as an interactive tree in Plotly.""" return _tree_plot(self) - - def join(self, *subtopics: int, **kwargs): - slot = min(subtopics) - max_subtopics = max(subtopics) - if len(self.children) < (max_subtopics - 1): - raise ValueError( - "These subtopics don't exist on the current node." - ) - if slot < 0: - raise ValueError( - "Outlier topics (-1) cannot be merged with other topics." - ) - if self.children is None: - raise ValueError( - "Current Node is a leaf, children can't be joined." - ) - try: - self.children[slot] = self.model.join_subtopics( - subtopics, self, **kwargs - ) - self.set_path(self.path) - except AttributeError as e: - raise AttributeError( - "Looks like your model is not an agglomerative hierarchical model." 
- ) from e - return self diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py index fa74b0b..b69181c 100644 --- a/turftopic/models/cluster.py +++ b/turftopic/models/cluster.py @@ -1,6 +1,6 @@ import warnings from datetime import datetime -from typing import Iterable, Literal, Optional, Union +from typing import Literal, Optional, Union import numpy as np from rich.console import Console @@ -18,7 +18,6 @@ from turftopic.feature_importance import (bayes_rule, cluster_centroid_distance, ctf_idf, soft_ctf_idf) -from turftopic.hierarchical import TopicNode from turftopic.vectorizer import default_vectorizer integer_message = """ @@ -231,12 +230,11 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray: ] ) old_labels = [label for label in self.classes_ if label != -1] - clustering = AgglomerativeClustering( + new_labels = AgglomerativeClustering( n_clusters=n_reduce_to, metric="cosine", linkage="average", - ) - new_labels = clustering.fit_predict(interesting_topic_vectors) + ).fit_predict(interesting_topic_vectors) res = {} if -1 in self.classes_: res[-1] = -1 @@ -256,58 +254,6 @@ def _merge_smallest(self, n_reduce_to: int): labels[labels == from_topic] = to_topic return labels - def join_subtopics( - self, subtopics: Iterable[int], hierarchy: Optional[TopicNode] = None - ) -> TopicNode: - """Joins subtopics in a topic hierarchy and returns the joint TopicNode. - > Note that this method does not alter the underlying hierarchy! - > You will need to use the join() method of a hierarchy for that. - - Parameters - ---------- - subtopics: iterable of int - Indices of subtopics to be joint. - hierarchy: TopicNode, default None - Hierarchy to join subtopics in, defaults to the root hierarchy of the model. - - Returns - ------- - TopicNode - New topic made up of the joint subtopics. - """ - if hierarchy is None: - hierarchy = self.hierarchy - subtopics = list(set(subtopics)) - slot = min(subtopics) - max_subtopics = max(subtopics) - if len(self.children) < (max_subtopics - 1): - raise ValueError( - "These subtopics don't exist on the current node." - ) - if slot < 0: - raise ValueError( - "Outlier topics (-1) cannot be merged with other topics." - ) - if self.children is None: - raise ValueError( - "Current Node is a leaf, children can't be joined." - ) - path = (*hierarchy.path, slot) - children = [self.hierarchy[sub] for sub in subtopics] - doc_topic_vector = self.hierarchy.doc_topic_matrix[:, subtopics].sum( - axis=1 - ) - rest = [ - doc_topic_vector - for i_topic, doc_topic_vector in enumerate( - self.hierarchy.doc_topic_matrix.T - ) - if i_topic not in subtopics - ] - doc_topic_matrix = np.stack([doc_topic_vector, rest]).T - # TODO - pass - def reduce_topics( self, n_reduce_to: int, @@ -340,7 +286,6 @@ def reduce_topics( self.labels_ = self._merge_smallest(n_reduce_to) elif reduction_method == "agglomerative": self.labels_ = self._merge_agglomerative(n_reduce_to) - self.estimate_components(self.feature_importance) return self.labels_ def reset_reduction(self): @@ -381,10 +326,6 @@ def estimate_components( ) clusters = np.unique(self.labels_) self.classes_ = np.sort(clusters) - if -1 in self.classes_: - # Putting outliers in the last position, so that when you index things, - # it works. - self.classes_ = np.array([*self.classes_[1:], -1]) self.topic_sizes_ = np.array( [np.sum(self.labels_ == label) for label in self.classes_] )
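
Usage notes (illustrative sketches, not part of the patches above):

The Bayes' rule estimator introduced in PATCH 01 and hardened in PATCH 09 can be sanity-checked in isolation. A minimal sketch, assuming only that turftopic with this series applied is importable; the toy matrices are made up for illustration:

    import numpy as np
    import scipy.sparse as spr

    from turftopic.feature_importance import bayes_rule

    # Toy inputs: two documents, two topics, three vocabulary terms.
    doc_topic_matrix = np.array([[1.0, 0.0], [0.0, 1.0]])
    doc_term_matrix = spr.csr_matrix([[2.0, 1.0, 0.0], [0.0, 1.0, 3.0]])

    p_tw = bayes_rule(doc_topic_matrix, doc_term_matrix)
    print(p_tw.shape)        # (n_topics, vocab_size) == (2, 3)
    print(p_tw.sum(axis=0))  # each column is p(topic|word), so columns sum to 1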
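
The model-level workflow the series enables, sketched under the assumption that the existing public entry points (ClusteringTopicModel, fit, print_topics) behave as on the main branch; the corpus is a placeholder, not real data:

    from turftopic import ClusteringTopicModel

    corpus: list[str] = [...]  # placeholder: any list of raw documents

    # PATCH 01: Bayes' rule is a new feature_importance option.
    model = ClusteringTopicModel(feature_importance="bayes")
    model.fit(corpus)
    model.print_topics()

    # PATCHES 02/10: term importances can be re-estimated on the fitted
    # clustering without re-encoding or re-clustering the corpus.
    model.estimate_components("centroid")

    # PATCHES 03/05: post-hoc topic reduction is reversible. At the end of
    # the series reduce_topics only relabels documents, so the topic-term
    # matrix is refreshed explicitly here.
    model.reduce_topics(10, reduction_method="smallest")
    model.estimate_components("bayes")
    model.reset_reduction()  # restores the original clustering and components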
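
For the dynamic path touched by PATCHES 09 and 11, a sketch assuming the DynamicTopicModel interface the rest of the library provides (a list of timestamps and a bins argument for fit_transform_dynamic, plus print_topics_over_time), neither of which is introduced by this series:

    from datetime import datetime

    from turftopic import ClusteringTopicModel

    corpus: list[str] = [...]           # placeholder documents
    timestamps: list[datetime] = [...]  # one timestamp per document

    model = ClusteringTopicModel(feature_importance="bayes")
    doc_topic_matrix = model.fit_transform_dynamic(
        corpus, timestamps=timestamps, bins=10
    )
    model.print_topics_over_time()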