From 2c928c8c5a39b7e1ebd727ed1a7faa8cf76fb15b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Wed, 26 Feb 2020 12:27:54 +0100
Subject: [PATCH] Word Cloud: Run counting in new thread

---
Review notes (this region is ignored by `git am`):
 * count_words() is a module-level function taking (data, state); it reports
   progress through `state` and returns (Counter, used_bow_features).
 * on_done() stores the counter, rebuilds the weight list, shows
   Info.bow_weights when the flag is set, and then calls handle_input().
 * handleNewSignals() is renamed to handle_input() and is now called
   explicitly from on_topic_change(), on_done() and the `data is None`
   branch of on_corpus_change().
 * NOTE(review): the `data is None` branch of on_corpus_change() does not
   call cancel(); confirm that a still-running count_words task cannot
   deliver a stale result to on_done() after the corpus was removed.
 * NOTE(review): line breaks, indentation and whitespace-only context lines
   in this file were re-derived from the hunk line counts; verify them
   against the target files before applying.

 orangecontrib/text/widgets/owwordcloud.py     | 94 +++++++++++++------
 .../text/widgets/tests/test_owworldcloud.py   | 18 ++++
 2 files changed, 82 insertions(+), 30 deletions(-)

diff --git a/orangecontrib/text/widgets/owwordcloud.py b/orangecontrib/text/widgets/owwordcloud.py
index b8f4feb7d..d0cbade07 100644
--- a/orangecontrib/text/widgets/owwordcloud.py
+++ b/orangecontrib/text/widgets/owwordcloud.py
@@ -2,7 +2,7 @@
 from collections import Counter
 from itertools import cycle
 from math import pi as PI
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 from AnyQt import QtCore
@@ -12,8 +12,9 @@
 from Orange.data import ContinuousVariable, Domain, StringVariable, Table
 from Orange.data.util import scale
 from Orange.widgets import gui, settings, widget
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
 from Orange.widgets.utils.itemmodels import PyTableModel
-from Orange.widgets.widget import Input, Output
+from Orange.widgets.widget import Input, Output, OWWidget
 
 from orangecontrib.text.corpus import Corpus
 from orangecontrib.text.topics import Topic
@@ -26,6 +27,51 @@
 N_BEST_PLOTTED = 200
 
 
+def _bow_words(corpus):
+    """
+    This function extract words from bag of words features and assign them
+    the frequency which is average bow count.
+    """
+    average_bows = {
+        f.name: corpus.X[:, i].mean()
+        for i, f in enumerate(corpus.domain.attributes)
+        if f.attributes.get("bow-feature", False)
+    }
+    # return only positive bow weights (those == 0 are non-existing words)
+    return {f: w for f, w in average_bows.items() if w > 0}
+
+
+def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
+    """
+    This function implements counting process of the word cloud widget and
+    is called in the separate thread by concurrent.
+
+    Parameters
+    ----------
+    data
+        Corpus with the data
+    state
+        State used to report status.
+
+    Returns
+    -------
+    Reports counts as a counter and boolean that tell whether the data were
+    retrieved on bag of words basis.
+    """
+    state.set_status("Calculating...")
+    state.set_progress_value(0)
+    bow_counts = _bow_words(data)
+    state.set_progress_value(0.5)
+    if bow_counts:
+        corpus_counter = Counter(bow_counts)
+    else:
+        corpus_counter = Counter(
+            w for doc in data.ngrams for w in doc
+        )
+    state.set_progress_value(1)
+    return corpus_counter, bool(bow_counts)
+
+
 class TableModel(PyTableModel):
     def __init__(self, precision, **kwargs):
         super().__init__(**kwargs)
@@ -55,7 +101,7 @@ def set_precision(self, precision: int):
         self.precision = precision
 
 
-class OWWordCloud(widget.OWWidget):
+class OWWordCloud(OWWidget, ConcurrentWidgetMixin):
     name = "Word Cloud"
     priority = 510
     icon = "icons/WordCloud.svg"
@@ -85,7 +131,8 @@ class Info(widget.OWWidget.Information):
         bow_weights = widget.Msg("Showing bag of words weights.")
 
     def __init__(self):
-        super().__init__()
+        OWWidget.__init__(self)
+        ConcurrentWidgetMixin.__init__(self)
         self.n_topic_words = 0
         self.documents_info_str = ""
         self.webview = None
@@ -225,7 +272,7 @@ def define_colors(
             # positive and negative numbers
             palette = TOPIC_COLORS if self.words_color else GRAY_TOPIC_COLORS
             colors = {
-                word: palette[weight >= 0]
+                word: palette[int(weight >= 0)]
                 for word, weight in zip(words, weights)
             }
         else:
@@ -293,7 +340,6 @@ def is_whole(d):
 
         words, weights = words[:N_BEST_PLOTTED], weights[:N_BEST_PLOTTED]
         self.shown_words, self.shown_weights = words, weights
-        # Repopulate table
         self.tablemodel.set_precision(
             0 if all(is_whole(w) for w in weights) else 2
         )
@@ -316,12 +362,12 @@ def is_whole(d):
             len(word) * float(weight)
             for word, weight in self.wordlist
         ])
-
         self.on_cloud_pref_change()
 
     @Inputs.topic
     def on_topic_change(self, data):
         self.topic = data
+        self.handle_input()
 
     def _apply_topic(self):
         data = self.topic
@@ -372,30 +418,19 @@ def on_corpus_change(self, data):
 
         self.corpus_counter = Counter()
         if data is not None:
-            bow_counts = self._bow_words()
-            if bow_counts:
-                self.Info.bow_weights()
-                self.corpus_counter = Counter(bow_counts)
-            else:
-                self.corpus_counter = Counter(
-                    w for doc in data.ngrams for w in doc
-                )
+            self.start(count_words, data)
+        else:
+            self.handle_input()
         self.create_weight_list()
 
-    def _bow_words(self):
-        """
-        This function extract words from bag of words features and assign them
-        the frequency which is average bow count.
-        """
-        average_bows = {
-            f.name: self.corpus.X[:, i].mean()
-            for i, f in enumerate(self.corpus.domain.attributes)
-            if f.attributes.get("bow-feature", False)
-        }
-        # return only positive bow weights (those == 0 are non-existing words)
-        return {f: w for f, w in average_bows.items() if w > 0}
-
-    def handleNewSignals(self):
+    def on_done(self, result: Tuple[Counter, bool]) -> None:
+        self.corpus_counter = result[0]
+        self.create_weight_list()
+        if result[1]:
+            self.Info.bow_weights()
+        self.handle_input()
+
+    def handle_input(self):
         if self.topic is not None and len(self.topic):
             self._apply_topic()
         elif self.corpus is not None and len(self.corpus):
@@ -408,7 +443,6 @@ def handleNewSignals(self):
         self.Warning.topic_precedence(
             shown=self.corpus is not None and self.topic is not None
         )
-
         if self.topic is not None or self.corpus is not None:
             if self.selected_words:
                 self.update_selection(self.selected_words)
diff --git a/orangecontrib/text/widgets/tests/test_owworldcloud.py b/orangecontrib/text/widgets/tests/test_owworldcloud.py
index 946fc1f48..81b1d69ae 100644
--- a/orangecontrib/text/widgets/tests/test_owworldcloud.py
+++ b/orangecontrib/text/widgets/tests/test_owworldcloud.py
@@ -40,6 +40,7 @@ def test_data(self):
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.send_signal(self.widget.Inputs.corpus, None)
+        self.wait_until_finished()
 
     def test_empty_data(self):
         """
@@ -48,6 +49,7 @@ def test_empty_data(self):
         """
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
         self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
+        self.wait_until_finished()
 
     def test_bow_features(self):
         """
@@ -62,6 +64,7 @@ def test_bow_features(self):
             v.attributes["bow-feature"] = True
 
         self.send_signal(self.widget.Inputs.corpus, data)
+        self.wait_until_finished()
         weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
         # due to computation error in computing mean use array_almost_equal
         np.testing.assert_array_almost_equal(weights, [1, 2, 2])
@@ -86,6 +89,7 @@ def test_bow_features(self):
             v.attributes["bow-feature"] = True
 
         self.send_signal(self.widget.Inputs.corpus, data)
+        self.wait_until_finished()
         weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
         np.testing.assert_array_almost_equal(weights, [1, 2])
 
@@ -109,8 +113,10 @@ def test_bow_info(self):
         # no data no info
         self.assertFalse(self.widget.Info.bow_weights.is_shown())
         self.send_signal(self.widget.Inputs.corpus, data)
+        self.wait_until_finished()
         self.assertFalse(self.widget.Info.bow_weights.is_shown())
         self.send_signal(self.widget.Inputs.corpus, None)
+        self.wait_until_finished()
         self.assertFalse(self.widget.Info.bow_weights.is_shown())
 
         # send bow data
@@ -120,8 +126,10 @@ def test_bow_info(self):
         for v in data.domain.attributes:
             v.attributes["bow-feature"] = True
         self.send_signal(self.widget.Inputs.corpus, data)
+        self.wait_until_finished()
         self.assertTrue(self.widget.Info.bow_weights.is_shown())
         self.send_signal(self.widget.Inputs.corpus, None)
+        self.wait_until_finished()
         self.assertFalse(self.widget.Info.bow_weights.is_shown())
 
     def test_topic(self):
@@ -141,43 +149,53 @@ def test_input_summary(self):
         insum = self.widget.info.set_input_summary = Mock()
 
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.wait_until_finished()
         insum.assert_called_with("42", "9 documents with 42 words\n")
 
         self.send_signal(self.widget.Inputs.topic, self.topic)
+        self.wait_until_finished()
         insum.assert_called_with(
             "42 | 10", "9 documents with 42 words\n10 words in a topic.")
 
         self.send_signal(self.widget.Inputs.corpus, None)
+        self.wait_until_finished()
         insum.assert_called_with(f"10", "10 words in a topic.")
 
         self.send_signal(self.widget.Inputs.topic, None)
+        self.wait_until_finished()
         insum.assert_called_with(self.widget.info.NoInput)
 
         self.send_signal(self.widget.Inputs.topic, self.topic)
+        self.wait_until_finished()
         insum.assert_called_with(f"10", "10 words in a topic.")
 
     def test_output_summary(self):
         outsum = self.widget.info.set_output_summary = Mock()
 
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.wait_until_finished()
         outsum.assert_called_with(
             "0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
         )
 
         self.send_signal(self.widget.Inputs.topic, self.topic)
+        self.wait_until_finished()
         outsum.assert_called_with(
             "0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
         )
 
         self.send_signal(self.widget.Inputs.corpus, None)
+        self.wait_until_finished()
         outsum.assert_called_with(self.widget.info.NoOutput)
 
         self.send_signal(self.widget.Inputs.topic, None)
+        self.wait_until_finished()
         outsum.assert_called_with(self.widget.info.NoOutput)
 
     def test_send_report(self):
         self.widget.send_report()
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.wait_until_finished()
         self.widget.send_report()
 
 