Skip to content

Commit

Permalink
Word Cloud: Run counting in new thread
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 26, 2020
1 parent ecb7162 commit 2c928c8
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 30 deletions.
94 changes: 64 additions & 30 deletions orangecontrib/text/widgets/owwordcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from collections import Counter
from itertools import cycle
from math import pi as PI
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

import numpy as np
from AnyQt import QtCore
Expand All @@ -12,8 +12,9 @@
from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import scale
from Orange.widgets import gui, settings, widget
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.utils.itemmodels import PyTableModel
from Orange.widgets.widget import Input, Output
from Orange.widgets.widget import Input, Output, OWWidget
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.topics import Topic

Expand All @@ -26,6 +27,51 @@
N_BEST_PLOTTED = 200


def _bow_words(corpus):
    """
    Collect the words described by bag-of-words features of the corpus.

    Every attribute of ``corpus.domain`` whose ``attributes`` dictionary holds
    a truthy ``"bow-feature"`` entry is treated as a word; its weight is the
    mean of the matching column of ``corpus.X``.

    Returns
    -------
    Dictionary that maps attribute names to their mean column value; only
    strictly positive means are included.
    """
    column_means = {}
    for column, feature in enumerate(corpus.domain.attributes):
        if not feature.attributes.get("bow-feature", False):
            continue
        column_means[feature.name] = corpus.X[:, column].mean()
    # a mean of zero marks a word that does not occur at all, so it is dropped
    return {
        word: weight for word, weight in column_means.items() if weight > 0
    }


def count_words(data: Corpus, state: TaskState) -> Tuple[Counter, bool]:
    """
    Count the words of a corpus for the word cloud widget.

    The widget passes this function to ``self.start`` so that the counting
    runs as a concurrent task. Bag-of-words weights are preferred; when the
    corpus has no (positive) bag-of-words features the n-grams of the
    documents are counted instead.

    Parameters
    ----------
    data
        Corpus with the data.
    state
        State used to report status and progress.

    Returns
    -------
    Word weights as a counter and a boolean that tells whether the weights
    were retrieved from bag-of-words features.
    """
    state.set_status("Calculating...")
    state.set_progress_value(0)
    bow_counts = _bow_words(data)
    # the widget progress bar works in percent (0-100): half-way is 50, not 0.5
    state.set_progress_value(50)
    if bow_counts:
        corpus_counter = Counter(bow_counts)
    else:
        corpus_counter = Counter(w for doc in data.ngrams for w in doc)
    state.set_progress_value(100)
    return corpus_counter, bool(bow_counts)


class TableModel(PyTableModel):
def __init__(self, precision, **kwargs):
super().__init__(**kwargs)
Expand Down Expand Up @@ -55,7 +101,7 @@ def set_precision(self, precision: int):
self.precision = precision


class OWWordCloud(widget.OWWidget):
class OWWordCloud(OWWidget, ConcurrentWidgetMixin):
name = "Word Cloud"
priority = 510
icon = "icons/WordCloud.svg"
Expand Down Expand Up @@ -85,7 +131,8 @@ class Info(widget.OWWidget.Information):
bow_weights = widget.Msg("Showing bag of words weights.")

def __init__(self):
super().__init__()
OWWidget.__init__(self)
ConcurrentWidgetMixin.__init__(self)
self.n_topic_words = 0
self.documents_info_str = ""
self.webview = None
Expand Down Expand Up @@ -225,7 +272,7 @@ def define_colors(
# positive and negative numbers
palette = TOPIC_COLORS if self.words_color else GRAY_TOPIC_COLORS
colors = {
word: palette[weight >= 0]
word: palette[int(weight >= 0)]
for word, weight in zip(words, weights)
}
else:
Expand Down Expand Up @@ -293,7 +340,6 @@ def is_whole(d):

words, weights = words[:N_BEST_PLOTTED], weights[:N_BEST_PLOTTED]
self.shown_words, self.shown_weights = words, weights

# Repopulate table
self.tablemodel.set_precision(
0 if all(is_whole(w) for w in weights) else 2
Expand All @@ -316,12 +362,12 @@ def is_whole(d):
len(word) * float(weight) for word, weight in
self.wordlist
])

self.on_cloud_pref_change()

@Inputs.topic
def on_topic_change(self, data):
    """
    Handler of the ``topic`` input.

    Stores the received value (it is compared with ``None`` elsewhere in the
    widget, so ``None`` presumably means that the input was removed) and
    refreshes the widget through ``handle_input``.
    """
    self.topic = data
    self.handle_input()

def _apply_topic(self):
data = self.topic
Expand Down Expand Up @@ -372,30 +418,19 @@ def on_corpus_change(self, data):

self.corpus_counter = Counter()
if data is not None:
bow_counts = self._bow_words()
if bow_counts:
self.Info.bow_weights()
self.corpus_counter = Counter(bow_counts)
else:
self.corpus_counter = Counter(
w for doc in data.ngrams for w in doc
)
self.start(count_words, data)
else:
self.handle_input()
self.create_weight_list()

def _bow_words(self):
"""
This function extract words from bag of words features and assign them
the frequency which is average bow count.
"""
average_bows = {
f.name: self.corpus.X[:, i].mean()
for i, f in enumerate(self.corpus.domain.attributes)
if f.attributes.get("bow-feature", False)
}
# return only positive bow weights (those == 0 are non-existing words)
return {f: w for f, w in average_bows.items() if w > 0}

def handleNewSignals(self):
def on_done(self, result: Tuple[Counter, bool]) -> None:
    """
    Take over the result of the word counting task.

    Presumably invoked by the concurrent mixin once the task started with
    ``self.start(count_words, data)`` has finished - confirm against
    ``ConcurrentWidgetMixin``.

    Parameters
    ----------
    result
        Pair of the word counter and a flag that tells whether the counts
        are bag-of-words weights.
    """
    counter, counts_are_bow = result[0], result[1]
    self.corpus_counter = counter
    self.create_weight_list()
    if counts_are_bow:
        # let the user know that weights, not raw counts, are displayed
        self.Info.bow_weights()
    self.handle_input()

def handle_input(self):
if self.topic is not None and len(self.topic):
self._apply_topic()
elif self.corpus is not None and len(self.corpus):
Expand All @@ -408,7 +443,6 @@ def handleNewSignals(self):
self.Warning.topic_precedence(
shown=self.corpus is not None and self.topic is not None
)

if self.topic is not None or self.corpus is not None:
if self.selected_words:
self.update_selection(self.selected_words)
Expand Down
18 changes: 18 additions & 0 deletions orangecontrib/text/widgets/tests/test_owworldcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def test_data(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.send_signal(self.widget.Inputs.corpus, None)
self.wait_until_finished()

def test_empty_data(self):
"""
Expand All @@ -48,6 +49,7 @@ def test_empty_data(self):
"""
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
self.wait_until_finished()

def test_bow_features(self):
"""
Expand All @@ -62,6 +64,7 @@ def test_bow_features(self):
v.attributes["bow-feature"] = True

self.send_signal(self.widget.Inputs.corpus, data)
self.wait_until_finished()
weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
# due to computation error in computing mean use array_almost_equal
np.testing.assert_array_almost_equal(weights, [1, 2, 2])
Expand All @@ -86,6 +89,7 @@ def test_bow_features(self):
v.attributes["bow-feature"] = True

self.send_signal(self.widget.Inputs.corpus, data)
self.wait_until_finished()
weights = list(zip(*sorted(self.widget.corpus_counter.items())))[1]
np.testing.assert_array_almost_equal(weights, [1, 2])

Expand All @@ -109,8 +113,10 @@ def test_bow_info(self):
# no data no info
self.assertFalse(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, data)
self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, None)
self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())

# send bow data
Expand All @@ -120,8 +126,10 @@ def test_bow_info(self):
for v in data.domain.attributes:
v.attributes["bow-feature"] = True
self.send_signal(self.widget.Inputs.corpus, data)
self.wait_until_finished()
self.assertTrue(self.widget.Info.bow_weights.is_shown())
self.send_signal(self.widget.Inputs.corpus, None)
self.wait_until_finished()
self.assertFalse(self.widget.Info.bow_weights.is_shown())

def test_topic(self):
Expand All @@ -141,43 +149,53 @@ def test_input_summary(self):
insum = self.widget.info.set_input_summary = Mock()

self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()
insum.assert_called_with("42", "9 documents with 42 words\n")

self.send_signal(self.widget.Inputs.topic, self.topic)
self.wait_until_finished()
insum.assert_called_with(
"42 | 10", "9 documents with 42 words\n10 words in a topic.")

self.send_signal(self.widget.Inputs.corpus, None)
self.wait_until_finished()
insum.assert_called_with(f"10", "10 words in a topic.")

self.send_signal(self.widget.Inputs.topic, None)
self.wait_until_finished()
insum.assert_called_with(self.widget.info.NoInput)

self.send_signal(self.widget.Inputs.topic, self.topic)
self.wait_until_finished()
insum.assert_called_with(f"10", "10 words in a topic.")

def test_output_summary(self):
    """
    Check the output summary after each change of the widget inputs.

    ``set_output_summary`` is replaced with a ``Mock`` so that the arguments
    of its last call can be inspected; every signal is followed by
    ``wait_until_finished`` before the assertion is made.
    """
    outsum = self.widget.info.set_output_summary = Mock()

    # corpus only
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    self.wait_until_finished()
    outsum.assert_called_with(
        "0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
    )

    # corpus and topic: the same summary is expected
    self.send_signal(self.widget.Inputs.topic, self.topic)
    self.wait_until_finished()
    outsum.assert_called_with(
        "0 | 0 | 42", "0 documents\n0 selected words\n42 words with counts"
    )

    # corpus removed, topic still present
    self.send_signal(self.widget.Inputs.corpus, None)
    self.wait_until_finished()
    outsum.assert_called_with(self.widget.info.NoOutput)

    # both inputs removed
    self.send_signal(self.widget.Inputs.topic, None)
    self.wait_until_finished()
    outsum.assert_called_with(self.widget.info.NoOutput)

def test_send_report(self):
    """
    Smoke test of ``send_report``: it is called once before any data is
    sent and once after a corpus was sent and the widget finished working.
    """
    self.widget.send_report()
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    # wait for the widget to finish before reporting again
    self.wait_until_finished()
    self.widget.send_report()


Expand Down

0 comments on commit 2c928c8

Please sign in to comment.