From 2c928c8c5a39b7e1ebd727ed1a7faa8cf76fb15b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Wed, 26 Feb 2020 12:27:54 +0100
Subject: [PATCH] Word Cloud: Run counting in new thread
---
orangecontrib/text/widgets/owwordcloud.py | 94 +++++++++++++------
.../text/widgets/tests/test_owworldcloud.py | 18 ++++
2 files changed, 82 insertions(+), 30 deletions(-)
diff --git a/orangecontrib/text/widgets/owwordcloud.py b/orangecontrib/text/widgets/owwordcloud.py
index b8f4feb7d..d0cbade07 100644
--- a/orangecontrib/text/widgets/owwordcloud.py
+++ b/orangecontrib/text/widgets/owwordcloud.py
@@ -2,7 +2,7 @@
from collections import Counter
from itertools import cycle
from math import pi as PI
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
import numpy as np
from AnyQt import QtCore
@@ -12,8 +12,9 @@
from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import scale
from Orange.widgets import gui, settings, widget
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.utils.itemmodels import PyTableModel
-from Orange.widgets.widget import Input, Output
+from Orange.widgets.widget import Input, Output, OWWidget
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.topics import Topic
@@ -26,6 +27,51 @@
N_BEST_PLOTTED = 200
+def _bow_words(corpus):
+ """
+    This function extracts words from bag-of-words features and assigns
+    them a frequency, which is the average bow count.
+ """
+ average_bows = {
+ f.name: corpus.X[:, i].mean()
+ for i, f in enumerate(corpus.domain.attributes)
+ if f.attributes.get("bow-feature", False)
+ }
+ # return only positive bow weights (those == 0 are non-existing words)
+ return {f: w for f, w in average_bows.items() if w > 0}
+
+
+def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
+    """
+    Count the words of a corpus for the word cloud widget. The function is
+    meant to be called in a separate thread by the concurrent mixin.
+
+    Parameters
+    ----------
+    data
+        Corpus with the data
+    state
+        State used to report status and progress (in percent, 0-100).
+
+    Returns
+    -------
+    Word counts as a Counter and a boolean that tells whether the counts
+    were retrieved from bag-of-words features.
+    """
+    state.set_status("Calculating...")
+    state.set_progress_value(0)
+    bow_counts = _bow_words(data)
+    state.set_progress_value(50)
+    if bow_counts:
+        corpus_counter = Counter(bow_counts)
+    else:
+        corpus_counter = Counter(
+            w for doc in data.ngrams for w in doc
+        )
+    state.set_progress_value(100)
+    return corpus_counter, bool(bow_counts)
+
+
class TableModel(PyTableModel):
def __init__(self, precision, **kwargs):
super().__init__(**kwargs)
@@ -55,7 +101,7 @@ def set_precision(self, precision: int):
self.precision = precision
-class OWWordCloud(widget.OWWidget):
+class OWWordCloud(OWWidget, ConcurrentWidgetMixin):
name = "Word Cloud"
priority = 510
icon = "icons/WordCloud.svg"
@@ -85,7 +131,8 @@ class Info(widget.OWWidget.Information):
bow_weights = widget.Msg("Showing bag of words weights.")
def __init__(self):
- super().__init__()
+ OWWidget.__init__(self)
+ ConcurrentWidgetMixin.__init__(self)
self.n_topic_words = 0
self.documents_info_str = ""
self.webview = None
@@ -225,7 +272,7 @@ def define_colors(
# positive and negative numbers
palette = TOPIC_COLORS if self.words_color else GRAY_TOPIC_COLORS
colors = {
- word: palette[weight >= 0]
+ word: palette[int(weight >= 0)]
for word, weight in zip(words, weights)
}
else:
@@ -293,7 +340,6 @@ def is_whole(d):
words, weights = words[:N_BEST_PLOTTED], weights[:N_BEST_PLOTTED]
self.shown_words, self.shown_weights = words, weights
-
# Repopulate table
self.tablemodel.set_precision(
0 if all(is_whole(w) for w in weights) else 2
@@ -316,12 +362,12 @@ def is_whole(d):
len(word) * float(weight) for word, weight in
self.wordlist
])
-
self.on_cloud_pref_change()
@Inputs.topic
def on_topic_change(self, data):
self.topic = data
+ self.handle_input()
def _apply_topic(self):
data = self.topic
@@ -372,30 +418,19 @@ def on_corpus_change(self, data):
self.corpus_counter = Counter()
if data is not None:
- bow_counts = self._bow_words()
- if bow_counts:
- self.Info.bow_weights()
- self.corpus_counter = Counter(bow_counts)
- else:
- self.corpus_counter = Counter(
- w for doc in data.ngrams for w in doc
- )
+ self.start(count_words, data)
+ else:
+ self.handle_input()
self.create_weight_list()
- def _bow_words(self):
- """
- This function extract words from bag of words features and assign them
- the frequency which is average bow count.
- """
- average_bows = {
- f.name: self.corpus.X[:, i].mean()
- for i, f in enumerate(self.corpus.domain.attributes)
- if f.attributes.get("bow-feature", False)
- }
- # return only positive bow weights (those == 0 are non-existing words)
- return {f: w for f, w in average_bows.items() if w > 0}
-
- def handleNewSignals(self):
+    def on_done(self, result: Tuple[Counter, bool]) -> None:
+        self.corpus_counter = result[0]  # Counter part of the result tuple
+        self.create_weight_list()
+        if result[1]:  # truthy flag -> show the bag-of-words info message
+            self.Info.bow_weights()
+        self.handle_input()
+
+ def handle_input(self):
if self.topic is not None and len(self.topic):
self._apply_topic()
elif self.corpus is not None and len(self.corpus):
@@ -408,7 +443,6 @@ def handleNewSignals(self):
self.Warning.topic_precedence(
shown=self.corpus is not None and self.topic is not None
)
-
if self.topic is not None or self.corpus is not None:
if self.selected_words:
self.update_selection(self.selected_words)
diff --git a/orangecontrib/text/widgets/tests/test_owworldcloud.py b/orangecontrib/text/widgets/tests/test_owworldcloud.py
index 946fc1f48..81b1d69ae 100644
--- a/orangecontrib/text/widgets/tests/test_owworldcloud.py
+++ b/orangecontrib/text/widgets/tests/test_owworldcloud.py
@@ -40,6 +40,7 @@ def test_data(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
def test_empty_data(self):
"""
@@ -48,6 +49,7 @@ def test_empty_data(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
+ self.wait_until_finished()
def test_bow_features(self):
"""
@@ -62,6 +64,7 @@ def test_bow_features(self):
v.attributes["bow-feature"] = True
self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
# due to computation error in computing mean use array_almost_equal
np.testing.assert_array_almost_equal(weights, [1, 2, 2])
@@ -86,6 +89,7 @@ def test_bow_features(self):
v.attributes["bow-feature"] = True
self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
np.testing.assert_array_almost_equal(weights, [1, 2])
@@ -109,8 +113,10 @@ def test_bow_info(self):
# no data no info
self.assertFalse(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())
# send bow data
@@ -120,8 +126,10 @@ def test_bow_info(self):
for v in data.domain.attributes:
v.attributes["bow-feature"] = True
self.send_signal(self.widget.Inputs.corpus, data)
+ self.wait_until_finished()
self.assertTrue(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())
def test_topic(self):
@@ -141,43 +149,53 @@ def test_input_summary(self):
insum = self.widget.info.set_input_summary = Mock()
self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
insum.assert_called_with("42", "9 documents with 42 words\n")
self.send_signal(self.widget.Inputs.topic, self.topic)
+ self.wait_until_finished()
insum.assert_called_with(
"42 | 10", "9 documents with 42 words\n10 words in a topic.")
self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
insum.assert_called_with(f"10", "10 words in a topic.")
self.send_signal(self.widget.Inputs.topic, None)
+ self.wait_until_finished()
insum.assert_called_with(self.widget.info.NoInput)
self.send_signal(self.widget.Inputs.topic, self.topic)
+ self.wait_until_finished()
insum.assert_called_with(f"10", "10 words in a topic.")
def test_output_summary(self):
outsum = self.widget.info.set_output_summary = Mock()
self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
outsum.assert_called_with(
"0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
)
self.send_signal(self.widget.Inputs.topic, self.topic)
+ self.wait_until_finished()
outsum.assert_called_with(
"0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
)
self.send_signal(self.widget.Inputs.corpus, None)
+ self.wait_until_finished()
outsum.assert_called_with(self.widget.info.NoOutput)
self.send_signal(self.widget.Inputs.topic, None)
+ self.wait_until_finished()
outsum.assert_called_with(self.widget.info.NoOutput)
def test_send_report(self):
self.widget.send_report()
self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.wait_until_finished()
self.widget.send_report()