From adde63cfcd4666bf022b42c2e85591253fc6a326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Mon, 8 Mar 2021 12:27:12 +0100 Subject: [PATCH 1/2] Corpus: make sure variables are initialized in from_list and from_numpy --- orangecontrib/text/corpus.py | 14 +++++++++---- orangecontrib/text/tests/test_corpus.py | 26 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index 8eace849f..4619d931e 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -554,14 +554,20 @@ def from_table(cls, domain, source, row_indices=...): @classmethod def from_numpy(cls, *args, **kwargs): - c = super().from_numpy(*args, **kwargs) - c._set_unique_titles() + t = super().from_numpy(*args, **kwargs) + # t is corpus but its constructor was not called since from_numpy + # calls just class method __new__, call it here to set default values + # for attributes such as _titles, _tokens, preprocessors, text_features + c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids) return c @classmethod def from_list(cls, domain, rows, weights=None): - c = super().from_list(domain, rows, weights) - c._set_unique_titles() + t = super().from_list(domain, rows, weights) + # t is corpus but its constructor was not called since from_numpy + # calls just class method __new__, call it here to set default values + # for attributes such as _titles, _tokens, preprocessors, text_features + c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids) return c @classmethod diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py index 11e86bb6f..79f1e1672 100644 --- a/orangecontrib/text/tests/test_corpus.py +++ b/orangecontrib/text/tests/test_corpus.py @@ -53,6 +53,32 @@ def test_corpus_from_file_with_tab(self): c2 = Corpus.from_file('book-excerpts.tab') self.assertEqual(c, c2) + def test_corpus_from_numpy(self): + domain = Domain( + [], 
metas=[StringVariable("title"), StringVariable("a")] + ) + corpus = Corpus.from_numpy( + domain, + np.empty((2, 0)), + metas=np.array([["title1", "a"], ["title2", "b"]]) + ) + self.assertEqual(2, len(corpus)) + assert_array_equal(["Document 1", "Document 2"], corpus.titles) + self.assertListEqual([StringVariable("title")], corpus.text_features) + self.assertIsNone(corpus._tokens) + self.assertListEqual([], corpus.used_preprocessor.preprocessors) + + def test_corpus_from_list(self): + domain = Domain( + [], metas=[StringVariable("title"), StringVariable("a")] + ) + corpus = Corpus.from_list(domain, [["title1", "a"], ["title2", "b"]]) + self.assertEqual(2, len(corpus)) + assert_array_equal(["Document 1", "Document 2"], corpus.titles) + self.assertListEqual([StringVariable("title")], corpus.text_features) + self.assertIsNone(corpus._tokens) + self.assertListEqual([], corpus.used_preprocessor.preprocessors) + def test_corpus_from_file_missing(self): with self.assertRaises(FileNotFoundError): Corpus.from_file('missing_file') From a1e87a1b167fac2260ad4644f8e7e4c9fd02b72b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Mon, 8 Mar 2021 12:28:12 +0100 Subject: [PATCH 2/2] Corpus: show error when no string variables at all not when text_features empty --- orangecontrib/text/widgets/owcorpus.py | 8 +++-- .../text/widgets/tests/test_owcorpus.py | 36 ++++++++++++++++++- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py index cd56ee24f..2d4aa6a4f 100644 --- a/orangecontrib/text/widgets/owcorpus.py +++ b/orangecontrib/text/widgets/owcorpus.py @@ -82,7 +82,7 @@ def __init__(self): # Used Text Features fbox = gui.widgetBox(self.controlArea, orientation=0) - ubox = gui.widgetBox(fbox, "Used text features", addSpace=False) + ubox = gui.widgetBox(fbox, "Used text features") self.used_attrs_model = VariableListModel(enable_dnd=True) self.used_attrs_view = 
VariablesListItemView() self.used_attrs_view.setModel(self.used_attrs_model) @@ -94,7 +94,7 @@ def __init__(self): aa.rowsRemoved.connect(self.update_feature_selection) # Ignored Text Features - ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False) + ibox = gui.widgetBox(fbox, "Ignored text features") self.unused_attrs_model = VariableListModel(enable_dnd=True) self.unused_attrs_view = VariablesListItemView() self.unused_attrs_view.setModel(self.unused_attrs_model) @@ -146,6 +146,7 @@ def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus: def open_file(self, path=None, data=None): self.closeContext() self.Error.clear() + self.cancel() self.unused_attrs_model[:] = [] self.used_attrs_model[:] = [] self.start(self._load_corpus, path, data) @@ -158,7 +159,8 @@ def on_done(self, corpus: Corpus) -> None: self.update_output_info() self._setup_title_dropdown() self.used_attrs = list(self.corpus.text_features) - if not self.corpus.text_features: + all_str_features = [f for f in self.corpus.domain.metas if f.is_string] + if not all_str_features: self.Error.corpus_without_text_features() self.Outputs.corpus.send(None) return diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py index 0896f9893..5ac4c8faa 100644 --- a/orangecontrib/text/widgets/tests/test_owcorpus.py +++ b/orangecontrib/text/widgets/tests/test_owcorpus.py @@ -11,7 +11,7 @@ class TestOWCorpus(WidgetTest): def setUp(self): - self.widget = self.create_widget(OWCorpus) + self.widget: OWCorpus = self.create_widget(OWCorpus) def check_output(self, sel_title): """ @@ -286,6 +286,40 @@ def test_keep_selected_variables(self): self.wait_until_finished() self.assertListEqual(list(prew_selected), self.widget.used_attrs) + def test_no_text_feature(self): + """ + Test with data that has empty text_features. The widget should not + show the error, but should have all features unused. 
+ """ + # The widget already loads book-excerpts from file and stores context + # settings; this call restores those settings to defaults, otherwise + # the Text variable is moved to the used attributes by the context + self.widget.settingsHandler.reset_to_original(self.widget) + data = Corpus.from_file("book-excerpts") + data.text_features = [] + self.send_signal(self.widget.Inputs.data, data) + self.wait_until_finished() + self.assertFalse( + self.widget.Error.corpus_without_text_features.is_shown() + ) + self.assertEqual(0, len(list(self.widget.used_attrs_model))) + self.assertListEqual( + [data.domain["Text"]], + list(self.widget.unused_attrs_model) + ) + + def test_corpus_without_text_features(self): + """ + Test if corpus_without_text_features is correctly raised for data + without text features + """ + data = Table("iris") + self.send_signal(self.widget.Inputs.data, data) + self.wait_until_finished() + self.assertTrue( + self.widget.Error.corpus_without_text_features.is_shown() + ) + if __name__ == "__main__": unittest.main() \ No newline at end of file