Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Word Cloud: Show bag of words weights when bag of words features available #486

Merged
merged 1 commit into from
Dec 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/widgets/wordcloud.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ Generates a word cloud from corpus.
**Outputs**

- Corpus: Documents that match the selection.
- Word: Selected word that can be used as query in [Concordance](concordance.md).
- Selected Word: Selected word that can be used as query in [Concordance](concordance.md).
- Word Counts: Words and their weights.

**Word Cloud** displays tokens in the corpus, their size denoting the frequency of the word in corpus. Words are listed by their frequency (weight) in the widget. The widget outputs documents, containing selected tokens from the word cloud.
**Word Cloud** displays tokens in the corpus, with each token's size denoting the frequency of the word in the corpus or, when bag-of-words features are present at the widget's input, its average bag-of-words count. Words are listed by their frequency (weight) in the widget. The widget outputs documents containing the selected tokens from the word cloud.

![](images/Word-Cloud-stamped.png)

Expand Down
38 changes: 35 additions & 3 deletions orangecontrib/text/widgets/owwordcloud.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# coding: utf-8
from collections import Counter
from math import pi as PI
from operator import itemgetter

import numpy as np
from AnyQt.QtCore import Qt, QItemSelection, QItemSelectionModel, pyqtSlot, \
Expand Down Expand Up @@ -40,6 +41,9 @@ class Outputs:
# Warning messages of the widget (extends the base OWWidget warnings).
class Warning(widget.OWWidget.Warning):
# Message text states that the Topic input overrides the Corpus input.
topic_precedence = widget.Msg('Input signal Topic takes priority over Corpus')

# Informational messages of the widget (extends OWWidget.Information).
class Info(widget.OWWidget.Information):
# Activated by on_corpus_change when bag of words weights are displayed.
bow_weights = widget.Msg("Showing bag of words weights.")

def __init__(self):
super().__init__()
self.n_topic_words = 0
Expand Down Expand Up @@ -250,17 +254,45 @@ def create_weight_list(self):
@Inputs.corpus
def on_corpus_change(self, data):
    """
    Handle a new value on the Corpus input.

    Stores the corpus, rebuilds ``self.corpus_counter`` and
    ``self.documents_info_str`` and refreshes the weight list.

    When the corpus carries bag of words features, their average counts
    (see ``_bow_words``) are used as word weights and the ``bow_weights``
    info message is shown; otherwise the words are counted from the
    corpus' n-grams.

    Parameters
    ----------
    data
        The corpus on the input, or None when the input is removed.
    """
    self.corpus = data
    # Drop any info message left over from the previous input.
    self.Info.clear()

    self.corpus_counter = Counter()
    if data is not None:
        bow_counts = self._bow_words()
        if bow_counts:
            self.Info.bow_weights()
            self.corpus_counter = Counter(bow_counts)
        else:
            # Only count n-grams when no bag of words weights exist, so the
            # corpus is not traversed just to have the result discarded.
            self.corpus_counter = Counter(
                w for doc in data.ngrams for w in doc)
        n_docs, n_words = len(data), len(self.corpus_counter)

    # n_docs / n_words are only evaluated when ``data`` is truthy, in which
    # case they were assigned in the branch above.
    self.documents_info_str = (
        '{} documents with {} words'.format(n_docs, n_words)
        if data else '(no documents on input)')

    self.create_weight_list()

def _bow_words(self):
    """
    Extract words from the bag of words features and weight each of them
    by its average bag of words count.

    Returns
    -------
    dict
        Maps the name of every bag of words feature to the mean of its
        column in ``self.corpus``. Empty when the corpus has no bag of
        words features or no documents.
    """
    bow_features = self._get_bow_variables()
    # Averaging over zero documents would produce NaN weights (and a NumPy
    # "Mean of empty slice" warning), so an empty corpus is treated like
    # one without bag of words features.
    if not bow_features or len(self.corpus) == 0:
        return {}

    # The first item of get_column_view's result holds the column values.
    return {
        f.name: self.corpus.get_column_view(f)[0].mean()
        for f in bow_features}

def _get_bow_variables(self):
    """
    Collect the bag of words variables of the current corpus.

    Returns
    -------
    list
        Variables from ``self.corpus.domain.variables`` whose ``attributes``
        dict has a truthy ``"bow-feature"`` entry; an empty list when no
        corpus is set.
    """
    # self.corpus is None while nothing is connected to the Corpus input.
    if self.corpus is None:
        return []
    return [var for var in self.corpus.domain.variables
            if var.attributes.get("bow-feature", False)]

def handleNewSignals(self):
if self.topic is not None and len(self.topic):
self._apply_topic()
Expand Down
69 changes: 69 additions & 0 deletions orangecontrib/text/widgets/tests/test_owworldcloud.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import unittest
import numpy as np

from Orange.widgets.tests.base import WidgetTest
from scipy.sparse import csr_matrix

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.widgets.owwordcloud import OWWordCloud

Expand Down Expand Up @@ -29,6 +32,72 @@ def test_empty_data(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus[:0])
self.assertTrue(self.widget.documents_info_str == "(no documents on input)")

def test_bow_features(self):
    """
    With bag of words features on the input, the word cloud weights must
    be taken from the BOW values.
    """
    bow_counts = [[3, 2, 0], [0, 3, 6], [0, 1, 0]]
    words = ["Word1", "Word2", "Word3"]
    widget = self.widget

    # case 1: every attribute is flagged as a bow feature
    corpus = self.corpus[:3]
    corpus.extend_attributes(csr_matrix(bow_counts), list(words))
    for variable in corpus.domain.attributes:
        variable.attributes["bow-feature"] = True
    self.send_signal(widget.Inputs.corpus, corpus)

    # expected weights are the column means of bow_counts
    self.assertDictEqual(
        widget.corpus_counter, {"Word1": 1, "Word2": 2, "Word3": 2})
    word_counts = self.get_output(widget.Outputs.word_counts)
    np.testing.assert_array_equal([2, 2, 1], word_counts.X.flatten())
    np.testing.assert_array_equal(
        ["Word2", "Word3", "Word1"], word_counts.metas.flatten())
    self.assertListEqual(
        [(2.0, 'Word2'), (2.0, 'Word3'), (1.0, 'Word1')],
        widget.tablemodel[:])

    # case 2: only the first two attributes are flagged as bow features
    corpus = self.corpus[:3]
    corpus.extend_attributes(csr_matrix(bow_counts), list(words))
    for variable in corpus.domain.attributes[:2]:
        variable.attributes["bow-feature"] = True
    self.send_signal(widget.Inputs.corpus, corpus)

    self.assertDictEqual(
        widget.corpus_counter, {"Word1": 1, "Word2": 2})
    word_counts = self.get_output(widget.Outputs.word_counts)
    np.testing.assert_array_equal([2, 1], word_counts.X.flatten())
    np.testing.assert_array_equal(
        ["Word2", "Word1"], word_counts.metas.flatten())
    self.assertListEqual(
        [(2.0, 'Word2'), (1.0, 'Word1')],
        widget.tablemodel[:])

def test_bow_info(self):
    """
    The bow_weights info message must be shown only while a corpus with
    bow features is on the input.
    """
    widget = self.widget
    corpus = self.corpus[:3]

    # nothing sent yet, then a corpus without bow features, then no corpus:
    # the info must stay hidden throughout
    self.assertFalse(widget.Info.bow_weights.is_shown())
    for payload in (corpus, None):
        self.send_signal(widget.Inputs.corpus, payload)
        self.assertFalse(widget.Info.bow_weights.is_shown())

    # add bow features to the corpus and send it
    corpus.extend_attributes(
        csr_matrix([[3, 2, 0], [0, 3, 6], [0, 1, 0]]),
        ["Word1", "Word2", "Word3"])
    for variable in corpus.domain.attributes:
        variable.attributes["bow-feature"] = True

    self.send_signal(widget.Inputs.corpus, corpus)
    self.assertTrue(widget.Info.bow_weights.is_shown())

    # removing the corpus hides the info again
    self.send_signal(widget.Inputs.corpus, None)
    self.assertFalse(widget.Info.bow_weights.is_shown())


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
unittest.main()