Merge pull request #731 from ajdapretnar/owrelevant-terms

OWLDAvis
biolab · Nov 15, 2021 · 1dfa334 · 1dfa334
2 parents ebfd330 + fea8f68
commit 1dfa334
Show file tree

Hide file tree

Showing 15 changed files with 707 additions and 31 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -5,7 +5,7 @@ recursive-include orangecontrib/text/tests *.txt *.json
 recursive-include orangecontrib/text/tutorials *.ows
 recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
 recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
-recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu
+recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu *.csv *.tab *.tab.metadata
 include orangecontrib/text/widgets/tests/bow-test
 recursive-include scripts *.sh *.py
 

diff --git a/doc/index.rst b/doc/index.rst
@@ -21,6 +21,7 @@ Widgets
    widgets/sentimentanalysis
    widgets/tweetprofiler
    widgets/topicmodelling-widget
+   widgets/LDAvis
    widgets/corpusviewer
    widgets/wordcloud
    widgets/concordance

diff --git a/doc/widgets.json b/doc/widgets.json
@@ -44,13 +44,6 @@
     "background": "light-blue",
     "keywords": []
    },
-   {
-    "text": "Wikipedia",
-    "doc": "widgets/wikipedia-widget.md",
-    "icon": "../orangecontrib/text/widgets/icons/Wikipedia.svg",
-    "background": "light-blue",
-    "keywords": []
-   },
    {
     "text": "Preprocess Text",
     "doc": "widgets/preprocesstext.md",
@@ -123,13 +116,27 @@
      "LDA"
     ]
    },
+   {
+    "text": "LDAvis",
+    "doc": "widgets/LDAvis.md",
+    "icon": "../orangecontrib/text/widgets/icons/LDAvis.svg",
+    "background": "light-blue",
+    "keywords": []
+   },
    {
     "text": "Corpus Viewer",
     "doc": "widgets/corpusviewer.md",
     "icon": "../orangecontrib/text/widgets/icons/CorpusViewer.svg",
     "background": "light-blue",
     "keywords": []
    },
+   {
+    "text": "Score Documents",
+    "doc": "widgets/score-documents.md",
+    "icon": "../orangecontrib/text/widgets/icons/ScoreDocuments.svg",
+    "background": "light-blue",
+    "keywords": []
+   },
    {
     "text": "Word Cloud",
     "doc": "widgets/wordcloud.md",
@@ -167,6 +174,23 @@
     "background": "light-blue",
     "keywords": []
    },
+   {
+    "text": "Word List",
+    "doc": null,
+    "icon": "../orangecontrib/text/widgets/icons/WordList.svg",
+    "background": "light-blue",
+    "keywords": []
+   },
+   {
+    "text": "Extract Keywords",
+    "doc": "widgets/keywords.md",
+    "icon": "../orangecontrib/text/widgets/icons/Keywords.svg",
+    "background": "light-blue",
+    "keywords": [
+     "characteristic",
+     "term"
+    ]
+   },
    {
     "text": "Statistics",
     "doc": "widgets/statistics.md",

diff --git a/doc/widgets/LDAvis.md b/doc/widgets/LDAvis.md
@@ -0,0 +1,33 @@
+LDAvis
+======
+
+Interactive exploration of LDA topics.
+
+**Inputs**
+
+- Topics: All LDA topics from topic modeling.
+
+**LDAvis** is an Orange implementation of pyLDAvis, which is in turn derived from the R package LDAvis by Sievert and Shirley (2014). The visualization is designed for the exploration of LDA topic models by adjusting the weights of top-ranked topic words. The Relevance parameter adjusts word weights and can be set between 0 and 1, where 1 ranks the words solely by their topic frequency, and 0 ranks them solely by their lift (the ratio of the probability of the word in a topic to the probability of the word in the corpus).
+
+![](images/LDAvis.png)
+
+1. Parameter for adjusting the word's relevance in the topic. 1 would show words as seen in the Topic Modeling widget.
+2. List of topics.
+
+The plot on the right shows the top 20 words as ranked by the computed relevance. The default relevance is 0.5, which strikes a good balance between in-topic probability and lift (the authors suggest 0.6). The weights (x axis) show absolute word counts. The red bar shows in-topic probability, while the gray bar shows the in-corpus probability of a given word.
+
+Example
+-------
+
+We show how to approximate the exact LDAvis in Orange. We are using the *book-excerpts.tab* corpus in the example, which we load with [Corpus](corpus-widget.md), pass to [Preprocess Text](preprocesstext.md), then to [Bag of Words](bagofwords-widget.md) and finally to [Topic Modelling](topicmodelling-widget.md), where we compute 10 topics with the LDA method.
+
+Next, we pass the *All Topics* output to **LDAvis** and to **MDS**. LDAvis shows the top ranked words for Topic 3, which seems to describe an industrial realm. One can adjust the Relevance slider to interactively explore word rankings.
+
+In MDS, we have set the color and size of the points to *Marginal Topic Probability* and labelled the points with their topic names. This represents the left part of the LDAvis visualization, namely the PCA-based MDS projection of topic similarity and the topics' corresponding weights in the corpus. It seems that Topic 8 is the most frequent topic in the corpus.
+
+![](images/LDAvis-Example.png)
+
+References
+----------
+
+Sievert, Carson and Kenneth Shirley (2014). LDAvis: A method for visualizing and interpreting topics. In Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces. [Available online.](https://aclanthology.org/W14-3110)
diff --git a/doc/widgets/images/LDAvis-Example.png b/doc/widgets/images/LDAvis-Example.png
diff --git a/doc/widgets/images/LDAvis.png b/doc/widgets/images/LDAvis.png
diff --git a/orangecontrib/text/tests/test_topic_modeling.py b/orangecontrib/text/tests/test_topic_modeling.py
@@ -68,9 +68,9 @@ def test_marginal_probability(self):
         doc_topics = np.array([[0.6, 0.1, 0.3],
                                [0.2, 0.6, 0.2],
                                [0.2, 0.3, 0.5]])
-        np.testing.assert_allclose(self.model._marginal_probability(
-                                   tokens, doc_topics),
-                                   [[0.37777778], [0.31111111], [0.31111111]])
+        marg_prob, num_tokens = self.model._marginal_probability(tokens, doc_topics)
+        np.testing.assert_allclose(marg_prob, [[0.37777778], [0.31111111], [0.31111111]])
+        self.assertEqual(9, num_tokens)
 
     def test_existing_attributes(self):
         """ doc_topic should not include existing X of corpus, just topics """

diff --git a/orangecontrib/text/topics/topics.py b/orangecontrib/text/topics/topics.py
@@ -1,5 +1,3 @@
-from collections import Counter
-
 from gensim import matutils
 import numpy as np
 from gensim.corpora import Dictionary
@@ -31,7 +29,8 @@ class GensimWrapper:
     name = NotImplemented
     Model = NotImplemented
     num_topics = NotImplemented
-    has_negative_weights = False    # whether words can negatively contibute to a topic
+    has_negative_weights = False    # whether words can negatively contribute
+    # to a topic
 
     def __init__(self, **kwargs):
         for k, v in kwargs.items():
@@ -68,7 +67,8 @@ def reset_model(self, corpus):
         # prevent model from updating
         _update = self.Model.update
         self.Model.update = self.dummy_method
-        self.id2word = Dictionary(corpus.ngrams_iterator(include_postags=True), prune_at=None)
+        self.id2word = Dictionary(corpus.ngrams_iterator(include_postags=True),
+                                  prune_at=None)
         self.model = self.Model(corpus=corpus,
                                 id2word=self.id2word, **self.kwargs)
         self.Model.update = _update
@@ -132,10 +132,13 @@ def _marginal_probability(tokens, doc_topic):
         topic across all documents.
 
         :return: np.array of marginal topic probabilities
+        :return: number of tokens
         """
         doc_length = [len(i) for i in tokens]
-        doc_length[:] = [x / sum(doc_length) for x in doc_length]
-        return np.reshape(np.sum(doc_topic.T * doc_length, axis=1), (-1, 1))
+        num_tokens = sum(doc_length)
+        doc_length[:] = [x / num_tokens for x in doc_length]
+        return np.reshape(np.sum(doc_topic.T * doc_length, axis=1), (-1, 1)),\
+            num_tokens
 
     def get_all_topics_table(self):
         """ Transform all topics from gensim model to table. """
@@ -150,26 +153,24 @@ def get_all_topics_table(self):
             X.append(weights)
         X = np.array(X)
 
-
         # take only first n_topics; e.g. when user requested 10, but gensim
         # returns only 9 — when the rank is lower than num_topics requested
         names = np.array(self.topic_names[:n_topics], dtype=object)[:, None]
 
         attrs = [ContinuousVariable(w) for w in sorted_words]
-        corpus_counter = Counter(w for doc in self.tokens for w in doc)
-        n_tokens = sum(corpus_counter.values())
-        for attr in attrs:
-            attr.attributes = {'word-frequency': corpus_counter[attr.name]/n_tokens}
         metas = [StringVariable('Topics'),
                  ContinuousVariable('Marginal Topic Probability')]
 
-        topic_proba = np.array(self._marginal_probability(self.tokens,
-                                                          self.doc_topic),
-                               dtype=object)
+        marg_proba, num_tokens = self._marginal_probability(self.tokens,
+                                                            self.doc_topic)
+        topic_proba = np.array(marg_proba, dtype=object)
 
         t = Topics.from_numpy(Domain(attrs, metas=metas), X=X,
                               metas=np.hstack((names, topic_proba)))
         t.name = 'All topics'
+        # required for distinguishing between models in OWRelevantTerms
+        t.attributes.update([('Model', f'{self.name}'),
+                             ('Number of tokens', num_tokens)])
         return t
 
     def get_top_words_by_id(self, topic_id, num_of_words=10):

diff --git a/orangecontrib/text/widgets/icons/LDAvis.svg b/orangecontrib/text/widgets/icons/LDAvis.svg