From cf96f20e7bbc15ddecf01e6e1c6efae3bdd35da5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 30 Dec 2019 14:29:32 +0100
Subject: [PATCH 1/2] Word Enrichment: Added ConcurrentWidgetMixin to compute
in another thread
---
.../text/widgets/owwordenrichment.py | 169 +++++++++++-------
1 file changed, 101 insertions(+), 68 deletions(-)
diff --git a/orangecontrib/text/widgets/owwordenrichment.py b/orangecontrib/text/widgets/owwordenrichment.py
index 92dfcf1e5..2c8fd633d 100644
--- a/orangecontrib/text/widgets/owwordenrichment.py
+++ b/orangecontrib/text/widgets/owwordenrichment.py
@@ -1,20 +1,48 @@
+from types import SimpleNamespace
+from typing import List, Optional, Any
+
import numpy as np
-from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem, \
- QApplication
+from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem
from Orange.data import Table, Domain
from Orange.widgets import gui
from Orange.widgets.settings import Setting
+from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.widget import OWWidget, Msg, Input
from Orange.statistics.util import FDR
from PyQt5.QtCore import QSize
from orangecontrib.text import Corpus
from orangecontrib.text.util import np_sp_sum
from orangecontrib.text.stats import hypergeom_p_values
-from orangecontrib.text.vectorization import BowVectorizer
-class OWWordEnrichment(OWWidget):
+class Result(SimpleNamespace):
+ words: Optional[List[str]] = None
+ p_values: Optional[List[float]] = None
+ fdr_values: Optional[List[float]] = None
+
+
+class Runner:
+ @staticmethod
+ def run(
+ selected_data_transformed: Table,
+ data: Table,
+ result: Result,
+ state: TaskState
+ ) -> None:
+ state.set_status("Listing words")
+ result.words = [
+ i.name for i in selected_data_transformed.domain.attributes]
+ state.set_status("Computing p-values")
+ result.p_values = hypergeom_p_values(
+ data.X, selected_data_transformed.X,
+ callback=state.set_progress_value
+ )
+ state.set_status("Computing FDR values")
+ result.fdr_values = FDR(result.p_values)
+
+
+class OWWordEnrichment(OWWidget, ConcurrentWidgetMixin):
# Basic widget info
name = "Word Enrichment"
description = "Word enrichment analysis for selected documents."
@@ -35,28 +63,22 @@ class Error(OWWidget.Error):
all_selected = Msg('All examples can not be selected!')
# Settings
- filter_by_p = Setting(False)
- filter_p_value = Setting(0.01)
- filter_by_fdr = Setting(True)
- filter_fdr_value = Setting(0.2)
+ filter_by_p: bool = Setting(False)
+ filter_p_value: float = Setting(0.01)
+ filter_by_fdr: bool = Setting(True)
+ filter_fdr_value: float = Setting(0.2)
def __init__(self):
- super().__init__()
+ OWWidget.__init__(self)
+ ConcurrentWidgetMixin.__init__(self)
# Init data
self.data = None
self.selected_data = None
- self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain
-
- self.words = []
- self.p_values = []
- self.fdr_values = []
+ # used for transforming the 'selected data' into the 'data' domain
+ self.selected_data_transformed = None
- # Info section
- fbox = gui.widgetBox(self.controlArea, "Info")
- self.info_all = gui.label(fbox, self, 'Cluster words:')
- self.info_sel = gui.label(fbox, self, 'Selected words:')
- self.info_fil = gui.label(fbox, self, 'After filtering:')
+ self.results = Result()
# Filtering settings
fbox = gui.widgetBox(self.controlArea, "Filter")
@@ -68,7 +90,6 @@ def __init__(self):
self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
- callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_p.setEnabled(self.filter_by_p)
@@ -79,7 +100,6 @@ def __init__(self):
self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value',
1e-4, 1, step=1e-4, labelWidth=15,
callback=self.filter_and_display,
- callbackOnReturn=True,
tooltip="Max p-value for word")
self.spin_fdr.setEnabled(self.filter_by_fdr)
gui.rubber(self.controlArea)
@@ -90,7 +110,7 @@ def __init__(self):
self.sig_words.setColumnCount(len(self.cols))
self.sig_words.setHeaderLabels(self.cols)
self.sig_words.setSortingEnabled(True)
- self.sig_words.setSelectionMode(QTreeView.ExtendedSelection)
+ self.sig_words.setSelectionMode(QTreeView.NoSelection)
self.sig_words.sortByColumn(2, 0) # 0 is ascending order
for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
@@ -102,6 +122,8 @@ def sizeHint(self):
@Inputs.data
def set_data(self, data=None):
self.data = data
+ # selected data transformed depends on data domain
+ self.selected_data_transformed = None
@Inputs.selected_data
def set_data_selected(self, data=None):
@@ -135,7 +157,8 @@ def check_data(self):
self.clear()
return
self.data = Corpus.from_table(bow_domain, self.data)
- self.selected_data_transformed = Corpus.from_table(bow_domain, self.selected_data)
+ self.selected_data_transformed = Corpus.from_table(
+ bow_domain, self.selected_data)
if np_sp_sum(self.selected_data_transformed.X) == 0:
self.Error.no_words_overlap()
@@ -144,15 +167,15 @@ def check_data(self):
self.Error.all_selected()
self.clear()
else:
+ self.set_input_info()
self.apply()
else:
self.clear()
def clear(self):
self.sig_words.clear()
- self.info_all.setText('Cluster words:')
- self.info_sel.setText('Selected words:')
- self.info_fil.setText('After filtering:')
+ self.info.set_input_summary(self.info.NoInput)
+ self.info.set_output_summary(self.info.NoOutput)
def filter_enabled(self, b):
self.chb_p.setEnabled(b)
@@ -168,43 +191,58 @@ def filter_and_display(self):
if self.selected_data_transformed is None: # do nothing when no Data
return
- count = 0
- if self.words:
- for word, pval, fval in zip(self.words, self.p_values, self.fdr_values):
- if (not self.filter_by_p or pval <= self.filter_p_value) and \
- (not self.filter_by_fdr or fval <= self.filter_fdr_value):
- it = EATreeWidgetItem(word, pval, fval, self.sig_words)
- self.sig_words.addTopLevelItem(it)
- count += 1
+ if self.results.words:
+ count = self.build_tree()
+ else:
+ count = 0
for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
+ self.set_output_info(count)
- self.info_all.setText('Cluster words: {}'.format(len(self.selected_data_transformed.domain.attributes)))
- self.info_sel.setText('Selected words: {}'.format(np.count_nonzero(np_sp_sum(self.selected_data_transformed.X, axis=0))))
- if not self.filter_by_p and not self.filter_by_fdr:
- self.info_fil.setText('After filtering:')
- self.info_fil.setEnabled(False)
- else:
- self.info_fil.setEnabled(True)
- self.info_fil.setText('After filtering: {}'.format(count))
-
- def progress(self, p):
- self.progressBarSet(p)
+ def build_tree(self) -> int:
+ count = 0
+ for word, pval, fval in zip(
+ self.results.words,
+ self.results.p_values,
+ self.results.fdr_values
+ ):
+ if ((not self.filter_by_p or pval <= self.filter_p_value) and
+ (not self.filter_by_fdr or fval <= self.filter_fdr_value)):
+ it = EATreeWidgetItem(word, pval, fval, self.sig_words)
+ self.sig_words.addTopLevelItem(it)
+ count += 1
+ return count
+
+ def set_input_info(self) -> None:
+ cluster_words = len(self.selected_data_transformed.domain.attributes)
+ selected_words = np.count_nonzero(np_sp_sum(
+ self.selected_data_transformed.X, axis=0))
+
+ self.info.set_input_summary(
+ f"{cluster_words}|{selected_words}",
+ f"Total words: {cluster_words}\n"
+ f"Words in subset: {selected_words}")
+
+ def set_output_info(self, count: int) -> None:
+ self.info.set_output_summary(str(count), f"{count} words displayed")
def apply(self):
- self.clear()
- self.progressBarInit()
+ self.sig_words.clear()
self.filter_enabled(False)
-
- self.words = [i.name for i in self.selected_data_transformed.domain.attributes]
- self.p_values = hypergeom_p_values(self.data.X,
- self.selected_data_transformed.X,
- callback=self.progress)
- self.fdr_values = FDR(self.p_values)
+ self.start(
+ Runner.run,
+ self.selected_data_transformed,
+ self.data,
+ self.results
+ )
+
+ def on_done(self, result: Result) -> None:
self.filter_and_display()
self.filter_enabled(True)
- self.progressBarFinished()
+
+ def on_exception(self, ex: Exception) -> None:
+ self.filter_enabled(True)
def tree_to_table(self):
view = [self.cols]
@@ -214,12 +252,13 @@ def tree_to_table(self):
for j in range(3):
line.append(self.sig_words.topLevelItem(i).text(j))
view.append(line)
- return(view)
+ return view
def send_report(self):
- if self.words:
+ if self.results.words:
self.report_table("Enriched words", self.tree_to_table())
+
fp = lambda score: "%0.5f" % score if score > 10e-3 else "%0.1e" % score
fpt = lambda score: "%0.9f" % score if score > 10e-3 else "%0.5e" % score
@@ -238,19 +277,13 @@ def __lt__(self, other):
col = self.treeWidget().sortColumn()
return self.data[col] < other.data[col]
-def main():
+
+if __name__ == '__main__':
+ from orangewidget.utils.widgetpreview import WidgetPreview
+ from orangecontrib.text.vectorization import BowVectorizer
corpus = Corpus.from_file('book-excerpts')
vect = BowVectorizer()
corpus_vect = vect.transform(corpus)
- app = QApplication([])
- widget = OWWordEnrichment()
- widget.set_data(corpus_vect)
- subset_corpus = corpus_vect[:10]
- widget.set_data_selected(subset_corpus)
- widget.handleNewSignals()
- widget.show()
- app.exec()
-
-if __name__ == '__main__':
- main()
\ No newline at end of file
+ WidgetPreview(OWWordEnrichment).run(
+ set_data_selected=corpus_vect[:10], set_data=corpus_vect)
From 58e36d58ce9aaf779b83a544abf8d87e8b203680 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 30 Dec 2019 14:29:47 +0100
Subject: [PATCH 2/2] Tests for Word Enrichment
---
.../text/widgets/owwordenrichment.py | 12 +-
.../widgets/tests/test_owwordenrichment.py | 204 ++++++++++++++++--
2 files changed, 197 insertions(+), 19 deletions(-)
diff --git a/orangecontrib/text/widgets/owwordenrichment.py b/orangecontrib/text/widgets/owwordenrichment.py
index 2c8fd633d..95e013ef4 100644
--- a/orangecontrib/text/widgets/owwordenrichment.py
+++ b/orangecontrib/text/widgets/owwordenrichment.py
@@ -80,6 +80,10 @@ def __init__(self):
self.results = Result()
+ # info box
+ fbox = gui.widgetBox(self.controlArea, "Info")
+ self.info_fil = gui.label(fbox, self, 'Words displayed: 0')
+
# Filtering settings
fbox = gui.widgetBox(self.controlArea, "Filter")
hbox = gui.widgetBox(fbox, orientation=0)
@@ -175,7 +179,7 @@ def check_data(self):
def clear(self):
self.sig_words.clear()
self.info.set_input_summary(self.info.NoInput)
- self.info.set_output_summary(self.info.NoOutput)
+ self.set_displayed_info(0)
def filter_enabled(self, b):
self.chb_p.setEnabled(b)
@@ -198,7 +202,7 @@ def filter_and_display(self):
for i in range(len(self.cols)):
self.sig_words.resizeColumnToContents(i)
- self.set_output_info(count)
+ self.set_displayed_info(count)
def build_tree(self) -> int:
count = 0
@@ -224,8 +228,8 @@ def set_input_info(self) -> None:
f"Total words: {cluster_words}\n"
f"Words in subset: {selected_words}")
- def set_output_info(self, count: int) -> None:
- self.info.set_output_summary(str(count), f"{count} words displayed")
+ def set_displayed_info(self, count: int) -> None:
+ self.info_fil.setText(f"Words displayed: {count}")
def apply(self):
self.sig_words.clear()
diff --git a/orangecontrib/text/widgets/tests/test_owwordenrichment.py b/orangecontrib/text/widgets/tests/test_owwordenrichment.py
index d55e925cf..955440186 100644
--- a/orangecontrib/text/widgets/tests/test_owwordenrichment.py
+++ b/orangecontrib/text/widgets/tests/test_owwordenrichment.py
@@ -1,5 +1,8 @@
import unittest
+from unittest.mock import Mock
+import Orange
+from Orange.data import Table, Domain
from Orange.widgets.tests.base import WidgetTest
from orangecontrib.text.corpus import Corpus
@@ -9,49 +12,50 @@
class TestWordEnrichment(WidgetTest):
def setUp(self):
- # type: OWWordEnrichment
self.widget = self.create_widget(OWWordEnrichment)
- self.corpus = Corpus.from_file('book-excerpts')
+ corpus = Corpus.from_file('book-excerpts')[::3]
vect = BowVectorizer()
- self.corpus_vect = vect.transform(self.corpus)
+ self.corpus_vect = vect.transform(corpus)
+ self.subset_corpus = self.corpus_vect[:5]
+ @unittest.skipIf(
+ Orange.__version__ < "3.24.0", "wait_until_finished not supported")
def test_filter_fdr(self):
widget = self.widget
- subset_corpus = self.corpus_vect[:10]
+
self.send_signal(widget.Inputs.data, self.corpus_vect)
- self.send_signal(widget.Inputs.selected_data, subset_corpus)
+ self.send_signal(widget.Inputs.selected_data, self.subset_corpus)
+ self.wait_until_finished(timeout=10000)
# test p-value filter
widget.filter_by_p = True
- widget.filter_p_value = 1e-9
+ widget.filter_p_value = 1e-3
widget.filter_by_fdr = False
widget.filter_fdr_value = 0.01
widget.filter_and_display()
self.assertEqual(widget.sig_words.topLevelItemCount(), 3)
self.assertEqual({widget.sig_words.topLevelItem(i).text(0)
- for i in (0, 1, 2)}, {'livesey', 'doctor', 'rum'})
+ for i in (0, 1, 2)}, {'livesey', 'jim', 'doctor'})
# test fdr filter
widget.filter_by_p = True
- widget.filter_p_value = 1e-4
+ widget.filter_p_value = 1e-1
widget.filter_by_fdr = True
- widget.filter_fdr_value = 1e-4
+ widget.filter_fdr_value = 0.9
widget.filter_and_display()
- self.assertEqual(widget.sig_words.topLevelItemCount(), 5)
- self.assertEqual({widget.sig_words.topLevelItem(i).text(0)
- for i in (0, 1, 2, 3, 4)},
- {'livesey', 'doctor', 'rum', 'admiral', 'inn'})
+ self.assertEqual(widget.sig_words.topLevelItemCount(), 1)
+ self.assertEqual(widget.sig_words.topLevelItem(0).text(0), "doctor")
# test if different when fdr false
widget.filter_by_p = True
- widget.filter_p_value = 1e-4
+ widget.filter_p_value = 1e-1
widget.filter_by_fdr = False
widget.filter_fdr_value = 1e-4
widget.filter_and_display()
- self.assertEqual(widget.sig_words.topLevelItemCount(), 16)
+ self.assertEqual(widget.sig_words.topLevelItemCount(), 108)
# test no results
widget.filter_by_p = True
@@ -62,6 +66,176 @@ def test_filter_fdr(self):
widget.filter_and_display()
self.assertEqual(widget.sig_words.topLevelItemCount(), 0)
+ self.send_signal(widget.Inputs.data, None)
+ widget.filter_and_display()
+ self.assertEqual(widget.sig_words.topLevelItemCount(), 0)
+
+ def test_empty_selection(self):
+ w = self.widget
+
+ # empty selection
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus[:0])
+ self.assertTrue(self.widget.Error.empty_selection.is_shown())
+
+ # toggle a filter control while the input data is invalid
+ w.controls.filter_by_p.click()
+
+ # selection not empty
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.assertFalse(self.widget.Error.empty_selection.is_shown())
+
+ def test_no_bow_features(self):
+ w = self.widget
+
+ iris = Table("iris")
+ self.send_signal(w.Inputs.data, iris)
+ self.send_signal(w.Inputs.selected_data, iris[:10])
+ self.assertTrue(self.widget.Error.no_bow_features.is_shown())
+
+ # toggle a filter control while the input data is invalid
+ w.controls.filter_by_p.click()
+
+ self.send_signal(w.Inputs.data, None)
+ self.send_signal(w.Inputs.selected_data, None)
+ self.assertFalse(self.widget.Error.no_bow_features.is_shown())
+
+ def test_all_selected(self):
+ w = self.widget
+
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.send_signal(w.Inputs.selected_data, self.corpus_vect)
+ self.assertTrue(self.widget.Error.all_selected.is_shown())
+
+ # toggle a filter control while the input data is invalid
+ w.controls.filter_by_p.click()
+
+ self.send_signal(w.Inputs.data, None)
+ self.send_signal(w.Inputs.selected_data, None)
+ self.assertFalse(self.widget.Error.all_selected.is_shown())
+
+ def test_no_overlapping(self):
+ w = self.widget
+
+ # with one column bow it is easier
+ corpus_vect = Corpus.from_table(Domain(
+ self.corpus_vect.domain.attributes[:1],
+ self.corpus_vect.domain.class_var,
+ self.corpus_vect.domain.metas
+ ), self.corpus_vect)
+
+ self.send_signal(w.Inputs.data, corpus_vect[10:15])
+ self.send_signal(w.Inputs.selected_data, corpus_vect[4:5])
+ self.assertTrue(self.widget.Error.no_words_overlap.is_shown())
+
+ # toggle a filter control while the input data is invalid
+ w.controls.filter_by_p.click()
+
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.assertFalse(self.widget.Error.no_words_overlap.is_shown())
+
+ @unittest.skipIf(
+ Orange.__version__ < "3.24.0", "wait_until_finished not supported")
+ def test_input_info(self):
+ w = self.widget
+ input_sum = w.info.set_input_summary = Mock()
+
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+
+ input_sum.assert_called_with(
+ "5923|1204", "Total words: 5923\nWords in subset: 1204")
+
+ self.wait_until_stop_blocking()
+ self.send_signal(w.Inputs.selected_data, None)
+ self.send_signal(w.Inputs.data, None)
+ input_sum.assert_called_with(w.info.NoInput)
+
+ @unittest.skipIf(
+ Orange.__version__ < "3.24.0", "wait_until_finished not supported")
+ def test_output_info(self):
+ w = self.widget
+ w.filter_p_value = 1e-3
+ w.filter_by_p = True
+ w.filter_by_fdr = False
+
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.wait_until_finished(timeout=10000)
+
+ self.assertEqual(w.info_fil.text(), "Words displayed: 3")
+
+ # test fdr filter
+ w.filter_by_p = True
+ w.filter_p_value = 1e-4
+ w.filter_by_fdr = True
+ w.filter_fdr_value = 1e-4
+ w.filter_and_display()
+ self.assertEqual(w.info_fil.text(), "Words displayed: 0")
+
+ self.send_signal(w.Inputs.selected_data, None)
+ self.assertEqual(w.info_fil.text(), "Words displayed: 0")
+ self.send_signal(w.Inputs.data, None)
+ self.assertEqual(w.info_fil.text(), "Words displayed: 0")
+
+ @unittest.skipIf(
+ Orange.__version__ < "3.24.0", "wait_until_finished not supported")
+ def test_filter_changed(self):
+ """
+ This case tests whether functions are correctly triggered when
+ the values in the filter fields change
+ """
+ w = self.widget
+
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.wait_until_finished(timeout=10000)
+
+ # test p-value filter
+ w.controls.filter_by_p.click() # set to true
+ w.controls.filter_p_value.valueChanged.emit(1e-3)
+ w.controls.filter_by_fdr.click() # set to false
+ w.controls.filter_fdr_value.valueChanged.emit(0.1)
+
+ self.assertEqual(w.sig_words.topLevelItemCount(), 3)
+ self.assertEqual({w.sig_words.topLevelItem(i).text(0)
+ for i in (0, 1, 2)}, {'livesey', 'jim', 'doctor'})
+
+ # test fdr filter
+ w.controls.filter_p_value.valueChanged.emit(1e-1)
+ w.controls.filter_by_fdr.click() # set to True
+ w.controls.filter_fdr_value.valueChanged.emit(0.9)
+
+ self.assertEqual(w.sig_words.topLevelItemCount(), 1)
+ self.assertEqual(w.sig_words.topLevelItem(0).text(0), "doctor")
+
+ # test if different when fdr false
+ w.controls.filter_by_fdr.click() # set to False
+
+ self.assertEqual(w.sig_words.topLevelItemCount(), 108)
+
+ # test no results
+ w.controls.filter_p_value.valueChanged.emit(1e-11)
+
+ self.assertEqual(w.sig_words.topLevelItemCount(), 0)
+
+ @unittest.skipIf(
+ Orange.__version__ < "3.24.0", "wait_until_finished not supported")
+ def test_report(self):
+ """
+ Just test if report works.
+ """
+ w = self.widget
+
+ w.send_report()
+
+ self.send_signal(w.Inputs.data, self.corpus_vect)
+ self.send_signal(w.Inputs.selected_data, self.subset_corpus)
+ self.wait_until_finished(timeout=10000)
+
+ w.send_report()
+
if __name__ == "__main__":
unittest.main()