biolab · ajdapretnar · Dec 18, 2019 · Dec 17, 2019
diff --git a/doc/widgets/wordcloud.md b/doc/widgets/wordcloud.md
@@ -11,9 +11,10 @@ Generates a word cloud from corpus.
 **Outputs**
 
 - Corpus: Documents that match the selection.
-- Word: Selected word that can be used as query in [Concordance](concordance.md).
+- Selected Word: Selected word that can be used as a query in [Concordance](concordance.md).
+- Word Counts: Words and their weights.
 
-**Word Cloud** displays tokens in the corpus, their size denoting the frequency of the word in corpus. Words are listed by their frequency (weight) in the widget. The widget outputs documents, containing selected tokens from the word cloud.
+**Word Cloud** displays tokens in the corpus, with their size denoting the frequency of the word in the corpus or, when bag-of-words features are present at the widget's input, the average bag-of-words count. Words are listed by their frequency (weight) in the widget. The widget outputs documents containing the selected tokens from the word cloud.
 
 ![](images/Word-Cloud-stamped.png)
 

diff --git a/orangecontrib/text/widgets/owwordcloud.py b/orangecontrib/text/widgets/owwordcloud.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from collections import Counter
 from math import pi as PI
+from operator import itemgetter
 
 import numpy as np
 from AnyQt.QtCore import Qt, QItemSelection, QItemSelectionModel, pyqtSlot, \
@@ -40,6 +41,9 @@ class Outputs:
     class Warning(widget.OWWidget.Warning):
         topic_precedence = widget.Msg('Input signal Topic takes priority over Corpus')
 
+    class Info(widget.OWWidget.Information):  # raised in on_corpus_change
+        bow_weights = widget.Msg("Showing bag of words weights.")
+
     def __init__(self):
         super().__init__()
         self.n_topic_words = 0
@@ -250,17 +254,45 @@ def create_weight_list(self):
     @Inputs.corpus
     def on_corpus_change(self, data):
         self.corpus = data
+        self.Info.clear()  # drop the message left by the previous input
 
         self.corpus_counter = Counter()
         if data is not None:
-            self.corpus_counter = Counter(w for doc in data.ngrams for w in doc)
+            bow_counts = self._bow_words()  # mean weight per bow-feature
+            if bow_counts:
+                self.Info.bow_weights()
+                self.corpus_counter = Counter(bow_counts)
+            else:  # no bow weights: count n-gram tokens instead
+                self.corpus_counter = Counter(w for doc in data.ngrams for w in doc)
             n_docs, n_words = len(data), len(self.corpus_counter)
 
-        self.documents_info_str = ('{} documents with {} words'.format(n_docs, n_words)
-                                   if data else '(no documents on input)')
+        self.documents_info_str = (
+            '{} documents with {} words'.format(n_docs, n_words)
+            if data else '(no documents on input)')
 
         self.create_weight_list()
 
+    def _bow_words(self):
+        """
+        Map each bag-of-words feature name to its mean value over the corpus.
+
+        Returns an empty dict when the corpus has no bow-features or no rows
+        (the mean of an empty column would be NaN).
+        """
+        bow_features = self._get_bow_variables()
+        if not bow_features or len(self.corpus) == 0:
+            return {}
+        return {
+            f.name: self.corpus.get_column_view(f)[0].mean()
+            for f in bow_features}
+
+    def _get_bow_variables(self):
+        """
+        Return the corpus variables flagged with the "bow-feature" attribute.
+        """
+        variables = self.corpus.domain.variables
+        return [v for v in variables if v.attributes.get("bow-feature", False)]
+
     def handleNewSignals(self):
         if self.topic is not None and len(self.topic):
             self._apply_topic()

diff --git a/orangecontrib/text/widgets/tests/test_owworldcloud.py b/orangecontrib/text/widgets/tests/test_owworldcloud.py
@@ -1,6 +1,9 @@
 import unittest
+import numpy as np
 
 from Orange.widgets.tests.base import WidgetTest
+from scipy.sparse import csr_matrix
+
 from orangecontrib.text.corpus import Corpus
 from orangecontrib.text.widgets.owwordcloud import OWWordCloud
 
@@ -29,6 +32,72 @@ def test_empty_data(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
         self.assertTrue(self.widget.documents_info_str == "(no documents on input)")
 
+    def test_bow_features(self):
+        """
+        When bag-of-words features are present in the input, the word cloud
+        must be built from the BOW weights.
+        """
+        data = self.corpus[:3]
+        data.extend_attributes(
+            csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]),
+            ["Word1", "Word2", "Word3"])
+        for v in data.domain.attributes:
+            v.attributes["bow-feature"] = True
+
+        self.send_signal(self.widget.Inputs.corpus, data)
+        self.assertDictEqual(
+            self.widget.corpus_counter, {"Word1": 1, "Word2": 2, "Word3": 2})
+        output = self.get_output(self.widget.Outputs.word_counts)
+        np.testing.assert_array_equal([2, 2, 1], output.X.flatten())
+        np.testing.assert_array_equal(
+            ["Word2", "Word3", "Word1"], output.metas.flatten())
+        self.assertListEqual(
+            [(2.0, 'Word2'), (2.0, 'Word3'), (1.0, 'Word1')],
+            self.widget.tablemodel[:])
+
+        # repeat with one word not marked as a bow-feature
+        data = self.corpus[:3]
+        data.extend_attributes(
+            csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]),
+            ["Word1", "Word2", "Word3"])
+        for v in data.domain.attributes[:2]:
+            v.attributes["bow-feature"] = True
+
+        self.send_signal(self.widget.Inputs.corpus, data)
+        self.assertDictEqual(
+            self.widget.corpus_counter, {"Word1": 1, "Word2": 2})
+        output = self.get_output(self.widget.Outputs.word_counts)
+        np.testing.assert_array_equal([2, 1], output.X.flatten())
+        np.testing.assert_array_equal(
+            ["Word2", "Word1"], output.metas.flatten())
+        self.assertListEqual(
+            [(2.0, 'Word2'), (1.0, 'Word1')],
+            self.widget.tablemodel[:])
+
+    def test_bow_info(self):
+        """
+        The info message must be shown only while bow-features are in use.
+        """
+        data = self.corpus[:3]
+
+        # no bow-features on the input: no info message
+        self.assertFalse(self.widget.Info.bow_weights.is_shown())
+        self.send_signal(self.widget.Inputs.corpus, data)
+        self.assertFalse(self.widget.Info.bow_weights.is_shown())
+        self.send_signal(self.widget.Inputs.corpus, None)
+        self.assertFalse(self.widget.Info.bow_weights.is_shown())
+
+        # bow-features on the input: info shown until the input is removed
+        data.extend_attributes(
+            csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]),
+            ["Word1", "Word2", "Word3"])
+        for v in data.domain.attributes:
+            v.attributes["bow-feature"] = True
+        self.send_signal(self.widget.Inputs.corpus, data)
+        self.assertTrue(self.widget.Info.bow_weights.is_shown())
+        self.send_signal(self.widget.Inputs.corpus, None)
+        self.assertFalse(self.widget.Info.bow_weights.is_shown())
+
 
 if __name__ == "__main__":
     unittest.main()