Merge pull request #687 from ajdapretnar/topic-evaluation

Topic Modeling: Add topic evaluation scores
biolab · Aug 13, 2021 · b838706
2 parents a7fb35c + 664a1bb
commit b838706
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 54 deletions.
diff --git a/orangecontrib/text/widgets/owtopicmodeling.py b/orangecontrib/text/widgets/owtopicmodeling.py
@@ -1,10 +1,16 @@
 import functools
+from typing import Any
+
+import numpy as np
 
 from AnyQt import QtGui, QtCore
 from AnyQt.QtCore import pyqtSignal, QSize
 from AnyQt.QtWidgets import (QVBoxLayout, QButtonGroup, QRadioButton,
                              QGroupBox, QTreeWidgetItem, QTreeWidget,
                              QStyleOptionViewItem, QStyledItemDelegate, QStyle)
+from Orange.widgets.utils.concurrent import TaskState, ConcurrentWidgetMixin
+
+from gensim.models import CoherenceModel
 
 from Orange.widgets import settings
 from Orange.widgets import gui
@@ -13,7 +19,7 @@
 from Orange.data import Table
 from orangecontrib.text.corpus import Corpus
 from orangecontrib.text.topics import Topic, LdaWrapper, HdpWrapper, LsiWrapper
-from orangecontrib.text.widgets.utils.concurrent import asynchronous
+from orangecontrib.text.topics.topics import GensimWrapper
 
 
 class TopicWidget(gui.OWComponent, QGroupBox):
@@ -96,7 +102,16 @@ def wrapper(self, *args, **kwargs):
     return decorator
 
 
-class OWTopicModeling(OWWidget):
+def _run(corpus: Corpus, model: GensimWrapper, state: TaskState):
+    def callback(i: float):
+        state.set_progress_value(i * 100)
+        if state.is_interruption_requested():
+            raise Exception
+
+    return model.fit_transform(corpus.copy(), chunk_number=100, on_progress=callback)
+
+
+class OWTopicModeling(OWWidget, ConcurrentWidgetMixin):
     name = "Topic Modelling"
     description = "Uncover the hidden thematic structure in a corpus."
     icon = "icons/TopicModeling.svg"
@@ -137,11 +152,18 @@ class Outputs:
     class Warning(OWWidget.Warning):
         less_topics_found = Msg('Less topics found than requested.')
 
+    class Error(OWWidget.Error):
+        unexpected_error = Msg("{}")
+
     def __init__(self):
         super().__init__()
+        ConcurrentWidgetMixin.__init__(self)
+
         self.corpus = None
         self.learning_thread = None
         self.__pending_selection = self.selection
+        self.perplexity = "n/a"
+        self.coherence = "n/a"
 
         # Commit button
         gui.auto_commit(self.buttonsArea, self, 'autocommit', 'Commit', box=False)
@@ -168,6 +190,11 @@ def __init__(self):
         self.toggle_widgets()
         method_layout.addStretch()
 
+        box = gui.vBox(self.controlArea, "Topic evaluation")
+        gui.label(box, self, "Log perplexity: %(perplexity)s")
+        gui.label(box, self, "Topic coherence: %(coherence)s")
+        self.controlArea.layout().insertWidget(1, box)
+
         # Topics description
         self.topic_desc = TopicViewer()
         self.topic_desc.topicSelected.connect(self.send_topic_by_id)
@@ -199,43 +226,42 @@ def toggle_widgets(self):
             widget.setVisible(i == self.method_index)
 
     def apply(self):
-        self.learning_task.stop()
+        self.topic_desc.clear()
         if self.corpus is not None:
-            self.learning_task()
+            self.Warning.less_topics_found.clear()
+            self.start(_run, self.corpus, self.model)
         else:
-            self.on_result(None)
-
-    @asynchronous
-    def learning_task(self):
-        return self.model.fit_transform(self.corpus.copy(), chunk_number=100,
-                                        on_progress=self.on_progress)
-
-    @learning_task.on_start
-    def on_start(self):
-        self.Warning.less_topics_found.clear()
-        self.progressBarInit()
-        self.topic_desc.clear()
-
-    @learning_task.on_result
-    def on_result(self, corpus):
-        self.progressBarFinished()
-        self.Outputs.corpus.send(corpus)
-        if corpus is None:
             self.topic_desc.clear()
+            self.Outputs.corpus.send(None)
             self.Outputs.selected_topic.send(None)
             self.Outputs.all_topics.send(None)
-        else:
-            self.topic_desc.show_model(self.model)
-            if self.__pending_selection:
-                self.topic_desc.select(self.__pending_selection)
-                self.__pending_selection = None
-            if self.model.actual_topics != self.model.num_topics:
-                self.Warning.less_topics_found()
-            self.Outputs.all_topics.send(self.model.get_all_topics_table())
-
-    @learning_task.callback
-    def on_progress(self, p):
-        self.progressBarSet(100 * p)
+
+    def on_done(self, corpus):
+        self.Outputs.corpus.send(corpus)
+        self.topic_desc.show_model(self.model)
+        if self.__pending_selection:
+            self.topic_desc.select(self.__pending_selection)
+            self.__pending_selection = None
+
+        if self.model.actual_topics != self.model.num_topics:
+            self.Warning.less_topics_found()
+
+        if self.model.name == "Latent Dirichlet Allocation":
+            bound = self.model.model.log_perplexity(corpus.ngrams_corpus)
+            self.perplexity = "{:.5f}".format(np.exp2(-bound))
+        cm = CoherenceModel(
+            model=self.model.model, texts=corpus.tokens, corpus=corpus, coherence="c_v"
+        )
+        coherence = cm.get_coherence()
+        self.coherence = "{:.5f}".format(coherence)
+
+        self.Outputs.all_topics.send(self.model.get_all_topics_table())
+
+    def on_exception(self, ex: Exception):
+        self.Error.unexpected_error(str(ex))
+
+    def on_partial_result(self, result: Any) -> None:
+        pass
 
     def send_report(self):
         self.report_items(*self.widgets[self.method_index].report_model())
@@ -381,4 +407,3 @@ def sizeHint(self, option, index):
     widget.set_data(Corpus.from_file('deerwester'))
     widget.show()
     app.exec()
-    widget.saveSettings()
diff --git a/orangecontrib/text/widgets/tests/test_owtopicmodeling.py b/orangecontrib/text/widgets/tests/test_owtopicmodeling.py
@@ -9,32 +9,22 @@
 
 
 class TestTopicModeling(WidgetTest):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls.corpus = Corpus.from_file('deerwester')
-
     def setUp(self):
+        self.corpus = Corpus.from_file("deerwester")
         self.widget = self.create_widget(OWTopicModeling)
 
     def test_data(self):
-        def until():
-            return bool(self.get_output(self.widget.Outputs.selected_topic))
-
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.process_events(until)
+        self.wait_until_finished()
 
         self.send_signal(self.widget.Inputs.corpus, None)
         output = self.get_output(self.widget.Outputs.selected_topic)
         self.assertIsNone(output)
 
     def test_saved_selection(self):
-        def until(widget=self.widget):
-            return bool(self.get_output(widget.Outputs.selected_topic,
-                                        widget=widget))
-
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.process_events(until)
+        self.wait_until_finished()
+
         idx = self.widget.topic_desc.model().index(2, 0)
         self.widget.topic_desc.selectionModel().select(
             idx, QItemSelectionModel.Rows | QItemSelectionModel.ClearAndSelect)
@@ -43,7 +33,7 @@ def until(widget=self.widget):
 
         w = self.create_widget(OWTopicModeling, stored_settings=state)
         self.send_signal(w.Inputs.corpus, self.corpus, widget=w)
-        self.process_events(lambda: until(w))
+
         output2 = self.get_output(w.Outputs.selected_topic, widget=w)
         # gensim uses quicksort, so sorting is unstable
         m1 = output1.metas[output1.metas[:, 0].argsort()]
@@ -55,17 +45,30 @@ def until(widget=self.widget):
 
     def test_all_topics_output(self):
         # LSI produces 9 topics for deerwester, output should be 9
-        def until(widget=self.widget):
-            return bool(self.get_output(widget.Outputs.selected_topic,
-                                        widget=widget))
 
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.process_events(until)
         output = self.get_output(self.widget.Outputs.all_topics)
+
         self.assertEqual(len(output), self.widget.model.actual_topics)
         self.assertEqual(output.metas.shape[1],
                          self.widget.corpus.metas.shape[1] + 1)
 
+    def test_topic_evaluation(self):
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.wait_until_finished()
+
+        # test LSI
+        self.assertEqual(self.widget.perplexity, "n/a")
+        self.assertNotEqual(self.widget.coherence, "n/a")
+
+        # test LDA, which is the only one with log perplexity
+        self.widget.method_index = 1
+        self.widget.commit()
+        self.wait_until_finished()
+
+        self.assertNotEqual(self.widget.perplexity, "n/a")
+        self.assertTrue(self.widget.coherence)
+
 
 if __name__ == "__main__":
     unittest.main()