From cf96f20e7bbc15ddecf01e6e1c6efae3bdd35da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Mon, 30 Dec 2019 14:29:32 +0100 Subject: [PATCH 1/2] Word Enrichment: Added ConcurrentWidgetMixin to compute in another thread --- .../text/widgets/owwordenrichment.py | 169 +++++++++++------- 1 file changed, 101 insertions(+), 68 deletions(-) diff --git a/orangecontrib/text/widgets/owwordenrichment.py b/orangecontrib/text/widgets/owwordenrichment.py index 92dfcf1e5..2c8fd633d 100644 --- a/orangecontrib/text/widgets/owwordenrichment.py +++ b/orangecontrib/text/widgets/owwordenrichment.py @@ -1,20 +1,48 @@ +from types import SimpleNamespace +from typing import List, Optional, Any + import numpy as np -from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem, \ - QApplication +from AnyQt.QtWidgets import QTreeWidget, QTreeView, QTreeWidgetItem from Orange.data import Table, Domain from Orange.widgets import gui from Orange.widgets.settings import Setting +from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState from Orange.widgets.widget import OWWidget, Msg, Input from Orange.statistics.util import FDR from PyQt5.QtCore import QSize from orangecontrib.text import Corpus from orangecontrib.text.util import np_sp_sum from orangecontrib.text.stats import hypergeom_p_values -from orangecontrib.text.vectorization import BowVectorizer -class OWWordEnrichment(OWWidget): +class Result(SimpleNamespace): + words: Optional[List[str]] = None + p_values: Optional[List[float]] = None + fdr_values: Optional[List[float]] = None + + +class Runner: + @staticmethod + def run( + selected_data_transformed: Table, + data: Table, + result: Result, + state: TaskState + ) -> None: + state.set_status("Listing words") + result.words = [ + i.name for i in selected_data_transformed.domain.attributes] + state.set_status("Computing p-values") + result.p_values = hypergeom_p_values( + data.X, selected_data_transformed.X, + callback=state.set_progress_value + ) + state.set_status("Computing FDR values") + result.fdr_values = FDR(result.p_values) + + +class OWWordEnrichment(OWWidget, ConcurrentWidgetMixin): # Basic widget info name = "Word Enrichment" description = "Word enrichment analysis for selected documents." @@ -35,28 +63,22 @@ class Error(OWWidget.Error): all_selected = Msg('All examples can not be selected!') # Settings - filter_by_p = Setting(False) - filter_p_value = Setting(0.01) - filter_by_fdr = Setting(True) - filter_fdr_value = Setting(0.2) + filter_by_p: bool = Setting(False) + filter_p_value: float = Setting(0.01) + filter_by_fdr: bool = Setting(True) + filter_fdr_value: float = Setting(0.2) def __init__(self): - super().__init__() + OWWidget.__init__(self) + ConcurrentWidgetMixin.__init__(self) # Init data self.data = None self.selected_data = None - self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain - - self.words = [] - self.p_values = [] - self.fdr_values = [] + # used for transforming the 'selected data' into the 'data' domain + self.selected_data_transformed = None - # Info section - fbox = gui.widgetBox(self.controlArea, "Info") - self.info_all = gui.label(fbox, self, 'Cluster words:') - self.info_sel = gui.label(fbox, self, 'Selected words:') - self.info_fil = gui.label(fbox, self, 'After filtering:') + self.results = Result() # Filtering settings fbox = gui.widgetBox(self.controlArea, "Filter") @@ -68,7 +90,6 @@ def __init__(self): self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, - callbackOnReturn=True, tooltip="Max p-value for word") self.spin_p.setEnabled(self.filter_by_p) @@ -79,7 +100,6 @@ def __init__(self): self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, - callbackOnReturn=True, tooltip="Max p-value for word") self.spin_fdr.setEnabled(self.filter_by_fdr) gui.rubber(self.controlArea) @@ -90,7 +110,7 @@ def __init__(self): self.sig_words.setColumnCount(len(self.cols)) self.sig_words.setHeaderLabels(self.cols) self.sig_words.setSortingEnabled(True) - self.sig_words.setSelectionMode(QTreeView.ExtendedSelection) + self.sig_words.setSelectionMode(QTreeView.NoSelection) self.sig_words.sortByColumn(2, 0) # 0 is ascending order for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) @@ -102,6 +122,8 @@ def sizeHint(self): @Inputs.data def set_data(self, data=None): self.data = data + # selected data transformed depends on data domain + self.selected_data_transformed = None @Inputs.selected_data def set_data_selected(self, data=None): @@ -135,7 +157,8 @@ def check_data(self): self.clear() return self.data = Corpus.from_table(bow_domain, self.data) - self.selected_data_transformed = Corpus.from_table(bow_domain, self.selected_data) + self.selected_data_transformed = Corpus.from_table( + bow_domain, self.selected_data) if np_sp_sum(self.selected_data_transformed.X) == 0: self.Error.no_words_overlap() @@ -144,15 +167,15 @@ def check_data(self): self.Error.all_selected() self.clear() else: + self.set_input_info() self.apply() else: self.clear() def clear(self): self.sig_words.clear() - self.info_all.setText('Cluster words:') - self.info_sel.setText('Selected words:') - self.info_fil.setText('After filtering:') + self.info.set_input_summary(self.info.NoInput) + self.info.set_output_summary(self.info.NoOutput) def filter_enabled(self, b): self.chb_p.setEnabled(b) @@ -168,43 +191,58 @@ def filter_and_display(self): if self.selected_data_transformed is None: # do nothing when no Data return - count = 0 - if self.words: - for word, pval, fval in zip(self.words, self.p_values, self.fdr_values): - if (not self.filter_by_p or pval <= self.filter_p_value) and \ - (not self.filter_by_fdr or fval <= self.filter_fdr_value): - it = EATreeWidgetItem(word, pval, fval, self.sig_words) - self.sig_words.addTopLevelItem(it) - count += 1 + if self.results.words: + count = self.build_tree() + else: + count = 0 for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) + self.set_output_info(count) - self.info_all.setText('Cluster words: {}'.format(len(self.selected_data_transformed.domain.attributes))) - self.info_sel.setText('Selected words: {}'.format(np.count_nonzero(np_sp_sum(self.selected_data_transformed.X, axis=0)))) - if not self.filter_by_p and not self.filter_by_fdr: - self.info_fil.setText('After filtering:') - self.info_fil.setEnabled(False) - else: - self.info_fil.setEnabled(True) - self.info_fil.setText('After filtering: {}'.format(count)) - - def progress(self, p): - self.progressBarSet(p) + def build_tree(self) -> int: + count = 0 + for word, pval, fval in zip( + self.results.words, + self.results.p_values, + self.results.fdr_values + ): + if ((not self.filter_by_p or pval <= self.filter_p_value) and + (not self.filter_by_fdr or fval <= self.filter_fdr_value)): + it = EATreeWidgetItem(word, pval, fval, self.sig_words) + self.sig_words.addTopLevelItem(it) + count += 1 + return count + + def set_input_info(self) -> None: + cluster_words = len(self.selected_data_transformed.domain.attributes) + selected_words = np.count_nonzero(np_sp_sum( + self.selected_data_transformed.X, axis=0)) + + self.info.set_input_summary( + f"{cluster_words}|{selected_words}", + f"Total words: {cluster_words}\n" + f"Words in subset: {selected_words}") + + def set_output_info(self, count: int) -> None: + self.info.set_output_summary(str(count), f"{count} words displayed") def apply(self): - self.clear() - self.progressBarInit() + self.sig_words.clear() self.filter_enabled(False) - - self.words = [i.name for i in self.selected_data_transformed.domain.attributes] - self.p_values = hypergeom_p_values(self.data.X, - self.selected_data_transformed.X, - callback=self.progress) - self.fdr_values = FDR(self.p_values) + self.start( + Runner.run, + self.selected_data_transformed, + self.data, + self.results + ) + + def on_done(self, result: Result) -> None: self.filter_and_display() self.filter_enabled(True) - self.progressBarFinished() + + def on_exception(self, ex: Exception) -> None: + self.filter_enabled(True) def tree_to_table(self): view = [self.cols] @@ -214,12 +252,13 @@ def tree_to_table(self): for j in range(3): line.append(self.sig_words.topLevelItem(i).text(j)) view.append(line) - return(view) + return view def send_report(self): - if self.words: + if self.results.words: self.report_table("Enriched words", self.tree_to_table()) + fp = lambda score: "%0.5f" % score if score > 10e-3 else "%0.1e" % score fpt = lambda score: "%0.9f" % score if score > 10e-3 else "%0.5e" % score @@ -238,19 +277,13 @@ def __lt__(self, other): col = self.treeWidget().sortColumn() return self.data[col] < other.data[col] -def main(): + +if __name__ == '__main__': + from orangewidget.utils.widgetpreview import WidgetPreview + from orangecontrib.text.vectorization import BowVectorizer corpus = Corpus.from_file('book-excerpts') vect = BowVectorizer() corpus_vect = vect.transform(corpus) - app = QApplication([]) - widget = OWWordEnrichment() - widget.set_data(corpus_vect) - subset_corpus = corpus_vect[:10] - widget.set_data_selected(subset_corpus) - widget.handleNewSignals() - widget.show() - app.exec() - -if __name__ == '__main__': - main() \ No newline at end of file + WidgetPreview(OWWordEnrichment).run( + set_data_selected=corpus_vect[:10], set_data=corpus_vect) From 58e36d58ce9aaf779b83a544abf8d87e8b203680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Mon, 30 Dec 2019 14:29:47 +0100 Subject: [PATCH 2/2] Tests for Word Enrichment --- .../text/widgets/owwordenrichment.py | 12 +- .../widgets/tests/test_owwordenrichment.py | 204 ++++++++++++++++-- 2 files changed, 197 insertions(+), 19 deletions(-) diff --git a/orangecontrib/text/widgets/owwordenrichment.py b/orangecontrib/text/widgets/owwordenrichment.py index 2c8fd633d..95e013ef4 100644 --- a/orangecontrib/text/widgets/owwordenrichment.py +++ b/orangecontrib/text/widgets/owwordenrichment.py @@ -80,6 +80,10 @@ def __init__(self): self.results = Result() + # info box + fbox = gui.widgetBox(self.controlArea, "Info") + self.info_fil = gui.label(fbox, self, 'Words displayed: 0') + # Filtering settings fbox = gui.widgetBox(self.controlArea, "Filter") hbox = gui.widgetBox(fbox, orientation=0) @@ -175,7 +179,7 @@ def check_data(self): def clear(self): self.sig_words.clear() self.info.set_input_summary(self.info.NoInput) - self.info.set_output_summary(self.info.NoOutput) + self.set_displayed_info(0) def filter_enabled(self, b): self.chb_p.setEnabled(b) @@ -198,7 +202,7 @@ def filter_and_display(self): for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) - self.set_output_info(count) + self.set_displayed_info(count) def build_tree(self) -> int: count = 0 @@ -224,8 +228,8 @@ def set_input_info(self) -> None: f"Total words: {cluster_words}\n" f"Words in subset: {selected_words}") - def set_output_info(self, count: int) -> None: - self.info.set_output_summary(str(count), f"{count} words displayed") + def set_displayed_info(self, count: int) -> None: + self.info_fil.setText(f"Words displayed: {count}") def apply(self): self.sig_words.clear() diff --git a/orangecontrib/text/widgets/tests/test_owwordenrichment.py b/orangecontrib/text/widgets/tests/test_owwordenrichment.py index d55e925cf..955440186 100644 --- a/orangecontrib/text/widgets/tests/test_owwordenrichment.py +++ b/orangecontrib/text/widgets/tests/test_owwordenrichment.py @@ -1,5 +1,8 @@ import unittest +from unittest.mock import Mock +import Orange +from Orange.data import Table, Domain from Orange.widgets.tests.base import WidgetTest from orangecontrib.text.corpus import Corpus @@ -9,49 +12,50 @@ class TestWordEnrichment(WidgetTest): def setUp(self): - # type: OWWordEnrichment self.widget = self.create_widget(OWWordEnrichment) - self.corpus = Corpus.from_file('book-excerpts') + corpus = Corpus.from_file('book-excerpts')[::3] vect = BowVectorizer() - self.corpus_vect = vect.transform(self.corpus) + self.corpus_vect = vect.transform(corpus) + self.subset_corpus = self.corpus_vect[:5] + @unittest.skipIf( + Orange.__version__ < "3.24.0", "wait_until_finished not supported") def test_filter_fdr(self): widget = self.widget - subset_corpus = self.corpus_vect[:10] + self.send_signal(widget.Inputs.data, self.corpus_vect) - self.send_signal(widget.Inputs.selected_data, subset_corpus) + self.send_signal(widget.Inputs.selected_data, self.subset_corpus) + self.wait_until_finished(timeout=10000) # test p-value filter widget.filter_by_p = True - widget.filter_p_value = 1e-9 + widget.filter_p_value = 1e-3 widget.filter_by_fdr = False widget.filter_fdr_value = 0.01 widget.filter_and_display() self.assertEqual(widget.sig_words.topLevelItemCount(), 3) self.assertEqual({widget.sig_words.topLevelItem(i).text(0) - for i in (0, 1, 2)}, {'livesey', 'doctor', 'rum'}) + for i in (0, 1, 2)}, {'livesey', 'jim', 'doctor'}) # test fdr filter widget.filter_by_p = True - widget.filter_p_value = 1e-4 + widget.filter_p_value = 1e-1 widget.filter_by_fdr = True - widget.filter_fdr_value = 1e-4 + widget.filter_fdr_value = 0.9 widget.filter_and_display() - self.assertEqual(widget.sig_words.topLevelItemCount(), 5) - self.assertEqual({widget.sig_words.topLevelItem(i).text(0) - for i in (0, 1, 2, 3, 4)}, - {'livesey', 'doctor', 'rum', 'admiral', 'inn'}) + self.assertEqual(widget.sig_words.topLevelItemCount(), 1) + self.assertEqual(widget.sig_words.topLevelItem(0).text(0), "doctor") # test if different when fdr false widget.filter_by_p = True - widget.filter_p_value = 1e-4 + widget.filter_p_value = 1e-1 widget.filter_by_fdr = False widget.filter_fdr_value = 1e-4 widget.filter_and_display() - self.assertEqual(widget.sig_words.topLevelItemCount(), 16) + self.assertEqual(widget.sig_words.topLevelItemCount(), 108) # test no results widget.filter_by_p = True @@ -62,6 +66,176 @@ def test_filter_fdr(self): widget.filter_and_display() self.assertEqual(widget.sig_words.topLevelItemCount(), 0) + self.send_signal(widget.Inputs.data, None) + widget.filter_and_display() + self.assertEqual(widget.sig_words.topLevelItemCount(), 0) + + def test_empty_selection(self): + w = self.widget + + # empty selection + self.send_signal(w.Inputs.data, self.corpus_vect) + self.send_signal(w.Inputs.selected_data, self.subset_corpus[:0]) + self.assertTrue(self.widget.Error.empty_selection.is_shown()) + + # when commands changed on non-valid data + w.controls.filter_by_p.click() + + # selection not empty + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.assertFalse(self.widget.Error.empty_selection.is_shown()) + + def test_no_bow_features(self): + w = self.widget + + iris = Table("iris") + self.send_signal(w.Inputs.data, iris) + self.send_signal(w.Inputs.selected_data, iris[:10]) + self.assertTrue(self.widget.Error.no_bow_features.is_shown()) + + # when commands changed on non-valid data + w.controls.filter_by_p.click() + + self.send_signal(w.Inputs.data, None) + self.send_signal(w.Inputs.selected_data, None) + self.assertFalse(self.widget.Error.no_bow_features.is_shown()) + + def test_all_selected(self): + w = self.widget + + self.send_signal(w.Inputs.data, self.corpus_vect) + self.send_signal(w.Inputs.selected_data, self.corpus_vect) + self.assertTrue(self.widget.Error.all_selected.is_shown()) + + # when commands changed on non-valid data + w.controls.filter_by_p.click() + + self.send_signal(w.Inputs.data, None) + self.send_signal(w.Inputs.selected_data, None) + self.assertFalse(self.widget.Error.all_selected.is_shown()) + + def test_no_overlapping(self): + w = self.widget + + # with one column bow it is easier + corpus_vect = Corpus.from_table(Domain( + self.corpus_vect.domain.attributes[:1], + self.corpus_vect.domain.class_var, + self.corpus_vect.domain.metas + ), self.corpus_vect) + + self.send_signal(w.Inputs.data, corpus_vect[10:15]) + self.send_signal(w.Inputs.selected_data, corpus_vect[4:5]) + self.assertTrue(self.widget.Error.no_words_overlap.is_shown()) + + # when commands changed on non-valid data + w.controls.filter_by_p.click() + + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.send_signal(w.Inputs.data, self.corpus_vect) + self.assertFalse(self.widget.Error.no_words_overlap.is_shown()) + + @unittest.skipIf( + Orange.__version__ < "3.24.0", "wait_until_finished not supported") + def test_input_info(self): + w = self.widget + input_sum = w.info.set_input_summary = Mock() + + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.send_signal(w.Inputs.data, self.corpus_vect) + + input_sum.assert_called_with( + "5923|1204", "Total words: 5923\nWords in subset: 1204") + + self.wait_until_stop_blocking() + self.send_signal(w.Inputs.selected_data, None) + self.send_signal(w.Inputs.data, None) + input_sum.assert_called_with(w.info.NoInput) + + @unittest.skipIf( + Orange.__version__ < "3.24.0", "wait_until_finished not supported") + def test_output_info(self): + w = self.widget + w.filter_p_value = 1e-3 + w.filter_by_p = True + w.filter_by_fdr = False + + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.send_signal(w.Inputs.data, self.corpus_vect) + self.wait_until_finished(timeout=10000) + + self.assertEqual(w.info_fil.text(), "Words displayed: 3") + + # test fdr filter + w.filter_by_p = True + w.filter_p_value = 1e-4 + w.filter_by_fdr = True + w.filter_fdr_value = 1e-4 + w.filter_and_display() + self.assertEqual(w.info_fil.text(), "Words displayed: 0") + + self.send_signal(w.Inputs.selected_data, None) + self.assertEqual(w.info_fil.text(), "Words displayed: 0") + self.send_signal(w.Inputs.data, None) + self.assertEqual(w.info_fil.text(), "Words displayed: 0") + + @unittest.skipIf( + Orange.__version__ < "3.24.0", "wait_until_finished not supported") + def test_filter_changed(self): + """ + This case tests whether function are correctly triggered when + values in filter field changes + """ + w = self.widget + + self.send_signal(w.Inputs.data, self.corpus_vect) + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.wait_until_finished(timeout=10000) + + # test p-value filter + w.controls.filter_by_p.click() # set to true + w.controls.filter_p_value.valueChanged.emit(1e-3) + w.controls.filter_by_fdr.click() # set to false + w.controls.filter_fdr_value.valueChanged.emit(0.1) + + self.assertEqual(w.sig_words.topLevelItemCount(), 3) + self.assertEqual({w.sig_words.topLevelItem(i).text(0) + for i in (0, 1, 2)}, {'livesey', 'jim', 'doctor'}) + + # # test fdr filter + w.controls.filter_p_value.valueChanged.emit(1e-1) + w.controls.filter_by_fdr.click() # set to True + w.controls.filter_fdr_value.valueChanged.emit(0.9) + + self.assertEqual(w.sig_words.topLevelItemCount(), 1) + self.assertEqual(w.sig_words.topLevelItem(0).text(0), "doctor") + + # test if different when fdr false + w.controls.filter_by_fdr.click() # set to False + + self.assertEqual(w.sig_words.topLevelItemCount(), 108) + + # # # test no results + w.controls.filter_p_value.valueChanged.emit(1e-11) + + self.assertEqual(w.sig_words.topLevelItemCount(), 0) + + @unittest.skipIf( + Orange.__version__ < "3.24.0", "wait_until_finished not supported") + def test_report(self): + """ + Just test if report works. + """ + w = self.widget + + w.send_report() + + self.send_signal(w.Inputs.data, self.corpus_vect) + self.send_signal(w.Inputs.selected_data, self.subset_corpus) + self.wait_until_finished(timeout=10000) + + w.send_report() + if __name__ == "__main__": unittest.main()