add dynamic topic modeling to clustering models #28

Merged · 9 commits · Mar 21, 2024
5 changes: 5 additions & 0 deletions docs/clustering.md
@@ -188,6 +188,11 @@ top2vec = ClusteringTopicModel(
In theory, the model descriptions above should result in the same behaviour as the other two packages, but there might be minor differences in implementation.
We do not intend to keep up with changes in Top2Vec's and BERTopic's internal implementation details indefinitely.

### _(Optional)_ 5. Dynamic Modeling

Clustering models are also capable of dynamic topic modeling. A single clustering model is fitted over the entire corpus, as we assume that one semantic model generates all of the documents.
To gain temporal representations for topics, the corpus is divided into equal or arbitrarily chosen time slices, and term importances are then estimated separately for each slice using Soft-c-TF-IDF, c-TF-IDF, or distances from the cluster centroid. When centroid distances are used to estimate term importances in dynamic modeling, the cluster centroids are computed from the documents and terms present within the given time slice.
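
A minimal sketch of what this looks like in practice (the corpus and timestamps are placeholders, and the `bins` argument, assumed here to set the number of equal time slices, is optional):

```python
from datetime import datetime

from turftopic import ClusteringTopicModel

corpus: list[str] = [...]  # placeholder: your documents
timestamps: list[datetime] = [...]  # placeholder: one timestamp per document

model = ClusteringTopicModel(feature_importance="soft-c-tf-idf")
# One clustering is fitted over the whole corpus; term importances are
# then estimated separately within each time slice.
doc_topic_matrix = model.fit_transform_dynamic(
    corpus, timestamps=timestamps, bins=10
)
model.print_topics_over_time()
```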

## Considerations

### Strengths
2 changes: 1 addition & 1 deletion docs/dynamic.md
@@ -28,7 +28,7 @@ Dynamic topic models in Turftopic have a unified interface.
To fit a dynamic topic model you will need a corpus that has been annotated with timestamps.
The timestamps need to be Python `datetime` objects, but pandas `Timestamp` objects are also supported.

Models that have dynamic modeling capabilities have a `fit_transform_dynamic()` method that fits the model on the corpus over time.
Models that have dynamic modeling capabilities (currently, `GMM` and `ClusteringTopicModel`) have a `fit_transform_dynamic()` method that fits the model on the corpus over time.

```python
from datetime import datetime
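
from turftopic import GMM

# A minimal sketch of the documented call (placeholder data; the full
# example is truncated in this diff, so the lines below are an
# illustration, not the original docs code):
corpus: list[str] = [...]  # placeholder documents
timestamps: list[datetime] = [...]  # one datetime per document

model = GMM(5)  # GMM supports dynamic modeling
doc_topic_matrix = model.fit_transform_dynamic(corpus, timestamps=timestamps)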
```
52 changes: 50 additions & 2 deletions tests/test_integration.py
@@ -1,3 +1,4 @@
from datetime import datetime
import tempfile
from pathlib import Path

@@ -15,6 +16,21 @@
SemanticSignalSeparation,
)


def generate_dates(
n_dates: int,
) -> list[datetime]:
"""Generate random dates to test dynamic models"""
dates = []
for n in range(n_dates):
d = np.random.randint(low=1, high=29)
m = np.random.randint(low=1, high=13)
y = np.random.randint(low=2000, high=2020)
date = datetime(year=y, month=m, day=d)
dates.append(date)
return dates


newsgroups = fetch_20newsgroups(
subset="all",
categories=[
@@ -25,12 +41,13 @@
texts = newsgroups.data
trf = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.asarray(trf.encode(texts))
timestamps = generate_dates(n_dates=len(texts))

models = [
GMM(5, encoder=trf),
SemanticSignalSeparation(5, encoder=trf),
KeyNMF(5, encoder=trf, keyword_scope='document'),
KeyNMF(5, encoder=trf, keyword_scope='corpus'),
KeyNMF(5, encoder=trf, keyword_scope="document"),
KeyNMF(5, encoder=trf, keyword_scope="corpus"),
ClusteringTopicModel(
n_reduce_to=5,
feature_importance="c-tf-idf",
@@ -46,6 +63,22 @@
AutoEncodingTopicModel(5, combined=True),
]

dynamic_models = [
GMM(5, encoder=trf),
ClusteringTopicModel(
n_reduce_to=5,
feature_importance="centroid",
encoder=trf,
reduction_method="smallest",
),
ClusteringTopicModel(
n_reduce_to=5,
feature_importance="soft-c-tf-idf",
encoder=trf,
reduction_method="smallest"
),
]


@pytest.mark.parametrize("model", models)
def test_fit_export_table(model):
@@ -56,3 +89,18 @@ def test_fit_export_table(model):
with out_path.open("w") as out_file:
out_file.write(table)
df = pd.read_csv(out_path)


@pytest.mark.parametrize("model", dynamic_models)
def test_fit_dynamic(model):
doc_topic_matrix = model.fit_transform_dynamic(
texts,
embeddings=embeddings,
timestamps=timestamps,
)
table = model.export_topics(format="csv")
with tempfile.TemporaryDirectory() as tmpdirname:
out_path = Path(tmpdirname).joinpath("topics.csv")
with out_path.open("w") as out_file:
out_file.write(table)
df = pd.read_csv(out_path)
36 changes: 27 additions & 9 deletions turftopic/base.py
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
class ContextualModel(ABC, TransformerMixin, BaseEstimator):
"""Base class for contextual topic models in Turftopic."""

def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
def get_topics(
self, top_k: int = 10
) -> List[Tuple[Any, List[Tuple[str, float]]]]:
"""Returns high-level topic representations in form of the top K words
in each topic.

@@ -135,8 +137,12 @@ def _highest_ranking_docs(
except AttributeError:
pass
kth = min(top_k, document_topic_matrix.shape[0] - 1)
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
:kth
]
highest = highest[
np.argsort(-document_topic_matrix[highest, topic_id])
]
scores = document_topic_matrix[highest, topic_id]
columns = []
columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
topic_id, raw_documents, document_topic_matrix, top_k
)
table = Table(show_lines=True)
table.add_column("Document", justify="left", style="magenta", max_width=100)
table.add_column(
"Document", justify="left", style="magenta", max_width=100
)
table.add_column("Score", style="blue", justify="right")
for row in rows:
table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
) -> list[list[str]]:
if topic_dist is None:
if text is None:
raise ValueError("You should either pass a text or a distribution.")
raise ValueError(
"You should either pass a text or a distribution."
)
try:
topic_dist = self.transform([text])
except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
rows.append([topic_names[ind], f"{score:.2f}"])
return [columns, *rows]

def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
def print_topic_distribution(
self, text=None, topic_dist=None, top_k: int = 10
):
"""Pretty prints topic distribution in a document.

Parameters
@@ -330,7 +342,9 @@ def fit_transform(
"""
pass

def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
def fit(
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
):
"""Fits model on the given corpus.

Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
if embeddings is None:
embeddings = self.encode_documents(corpus)
try:
document_topic_matrix = self.transform(corpus, embeddings=embeddings)
document_topic_matrix = self.transform(
corpus, embeddings=embeddings
)
except (AttributeError, NotFittedError):
document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
document_topic_matrix = self.fit_transform(
corpus, embeddings=embeddings
)
dtm = self.vectorizer.transform(corpus) # type: ignore
res: TopicData = {
"corpus": corpus,
4 changes: 3 additions & 1 deletion turftopic/dynamic.py
@@ -199,7 +199,9 @@ def print_topics_over_time(
show_scores: bool, default False
Indicates whether to show importance scores for each word.
"""
columns, *rows = self._topics_over_time(top_k, show_scores, date_format)
columns, *rows = self._topics_over_time(
top_k, show_scores, date_format
)
table = Table(show_lines=True)
for column in columns:
table.add_column(column)
2 changes: 1 addition & 1 deletion turftopic/encoders/__init__.py
@@ -9,5 +9,5 @@
"OpenAIEmbeddings",
"VoyageEmbeddings",
"ExternalEncoder",
"E5Encoder"
"E5Encoder",
]
12 changes: 12 additions & 0 deletions turftopic/encoders/utils.py
@@ -0,0 +1,12 @@
import itertools
from typing import Iterable, List


def batched(iterable, n: int) -> Iterable[List[str]]:
"Batch data into tuples of length n. The last batch may be shorter."
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError("n must be at least one")
it = iter(iterable)
while batch := list(itertools.islice(it, n)):
yield batch
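
For reference, a quick usage sketch of `batched` (the import path is inferred from the file location):

```python
from turftopic.encoders.utils import batched

# Batches of size 3 from a 7-element iterable; the last batch is shorter.
print(list(batched("ABCDEFG", 3)))
# [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
```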