From adde63cfcd4666bf022b42c2e85591253fc6a326 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 8 Mar 2021 12:27:12 +0100
Subject: [PATCH 1/2] Corpus: make sure variables are initialized in from_list
and from_numpy
---
orangecontrib/text/corpus.py | 14 +++++++++----
orangecontrib/text/tests/test_corpus.py | 26 +++++++++++++++++++++++++
2 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 8eace849f..4619d931e 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -554,14 +554,20 @@ def from_table(cls, domain, source, row_indices=...):
@classmethod
def from_numpy(cls, *args, **kwargs):
- c = super().from_numpy(*args, **kwargs)
- c._set_unique_titles()
+ t = super().from_numpy(*args, **kwargs)
+ # t is a Corpus, but its constructor was not called because from_numpy
+ # only calls __new__; build a new Corpus here to set default values for
+ # attributes such as _titles, _tokens, preprocessors and text_features
+ c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
return c
@classmethod
def from_list(cls, domain, rows, weights=None):
- c = super().from_list(domain, rows, weights)
- c._set_unique_titles()
+ t = super().from_list(domain, rows, weights)
+ # t is a Corpus, but its constructor was not called by the inherited
+ # from_list; build a new Corpus here to set default values for
+ # attributes such as _titles, _tokens, preprocessors and text_features
+ c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
return c
@classmethod
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index 11e86bb6f..79f1e1672 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -53,6 +53,32 @@ def test_corpus_from_file_with_tab(self):
c2 = Corpus.from_file('book-excerpts.tab')
self.assertEqual(c, c2)
+ def test_corpus_from_numpy(self):
+ domain = Domain(
+ [], metas=[StringVariable("title"), StringVariable("a")]
+ )
+ corpus = Corpus.from_numpy(
+ domain,
+ np.empty((2, 0)),
+ metas=np.array([["title1", "a"], ["title2", "b"]])
+ )
+ self.assertEqual(2, len(corpus))
+ assert_array_equal(["Document 1", "Document 2"], corpus.titles)
+ self.assertListEqual([StringVariable("title")], corpus.text_features)
+ self.assertIsNone(corpus._tokens)
+ self.assertListEqual([], corpus.used_preprocessor.preprocessors)
+
+ def test_corpus_from_list(self):
+ domain = Domain(
+ [], metas=[StringVariable("title"), StringVariable("a")]
+ )
+ corpus = Corpus.from_list(domain, [["title1", "a"], ["title2", "b"]])
+ self.assertEqual(2, len(corpus))
+ assert_array_equal(["Document 1", "Document 2"], corpus.titles)
+ self.assertListEqual([StringVariable("title")], corpus.text_features)
+ self.assertIsNone(corpus._tokens)
+ self.assertListEqual([], corpus.used_preprocessor.preprocessors)
+
def test_corpus_from_file_missing(self):
with self.assertRaises(FileNotFoundError):
Corpus.from_file('missing_file')
From a1e87a1b167fac2260ad4644f8e7e4c9fd02b72b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 8 Mar 2021 12:28:12 +0100
Subject: [PATCH 2/2] Corpus: show error only when there are no string variables
at all, not when text_features is empty
---
orangecontrib/text/widgets/owcorpus.py | 8 +++--
.../text/widgets/tests/test_owcorpus.py | 36 ++++++++++++++++++-
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index cd56ee24f..2d4aa6a4f 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -82,7 +82,7 @@ def __init__(self):
# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
- ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
+ ubox = gui.widgetBox(fbox, "Used text features")
self.used_attrs_model = VariableListModel(enable_dnd=True)
self.used_attrs_view = VariablesListItemView()
self.used_attrs_view.setModel(self.used_attrs_model)
@@ -94,7 +94,7 @@ def __init__(self):
aa.rowsRemoved.connect(self.update_feature_selection)
# Ignored Text Features
- ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False)
+ ibox = gui.widgetBox(fbox, "Ignored text features")
self.unused_attrs_model = VariableListModel(enable_dnd=True)
self.unused_attrs_view = VariablesListItemView()
self.unused_attrs_view.setModel(self.unused_attrs_model)
@@ -146,6 +146,7 @@ def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
def open_file(self, path=None, data=None):
self.closeContext()
self.Error.clear()
+ self.cancel()
self.unused_attrs_model[:] = []
self.used_attrs_model[:] = []
self.start(self._load_corpus, path, data)
@@ -158,7 +159,8 @@ def on_done(self, corpus: Corpus) -> None:
self.update_output_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
- if not self.corpus.text_features:
+ all_str_features = [f for f in self.corpus.domain.metas if f.is_string]
+ if not all_str_features:
self.Error.corpus_without_text_features()
self.Outputs.corpus.send(None)
return
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
index 0896f9893..5ac4c8faa 100644
--- a/orangecontrib/text/widgets/tests/test_owcorpus.py
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -11,7 +11,7 @@
class TestOWCorpus(WidgetTest):
def setUp(self):
- self.widget = self.create_widget(OWCorpus)
+ self.widget: OWCorpus = self.create_widget(OWCorpus)
def check_output(self, sel_title):
"""
@@ -286,6 +286,40 @@ def test_keep_selected_variables(self):
self.wait_until_finished()
self.assertListEqual(list(prew_selected), self.widget.used_attrs)
+ def test_no_text_feature(self):
+ """
+ Test data with empty text_features. The widget should not show
+ the error, but should list all string features as unused.
+ """
+ # the widget already loaded book-excerpts and stored context settings;
+ # reset the settings to defaults here, otherwise the context would
+ # move the Text variable to the used attributes
+ self.widget.settingsHandler.reset_to_original(self.widget)
+ data = Corpus.from_file("book-excerpts")
+ data.text_features = []
+ self.send_signal(self.widget.Inputs.data, data)
+ self.wait_until_finished()
+ self.assertFalse(
+ self.widget.Error.corpus_without_text_features.is_shown()
+ )
+ self.assertEqual(0, len(list(self.widget.used_attrs_model)))
+ self.assertListEqual(
+ [data.domain["Text"]],
+ list(self.widget.unused_attrs_model)
+ )
+
+ def test_corpus_without_text_features(self):
+ """
+ Test if corpus_without_text_features is correctly raised for data
+ without text features
+ """
+ data = Table("iris")
+ self.send_signal(self.widget.Inputs.data, data)
+ self.wait_until_finished()
+ self.assertTrue(
+ self.widget.Error.corpus_without_text_features.is_shown()
+ )
+
if __name__ == "__main__":
unittest.main()
\ No newline at end of file