Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Corpus: fix from_numpy and from_list; modify widget to work with corpora without text_features #627

Merged
merged 2 commits into from
Mar 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,14 +554,20 @@ def from_table(cls, domain, source, row_indices=...):

@classmethod
def from_numpy(cls, *args, **kwargs):
    """Create a corpus from numpy arrays with Corpus defaults initialised.

    All positional and keyword arguments are forwarded unchanged to the
    parent class's ``from_numpy``.

    Returns:
        A new ``Corpus`` built from the domain, X, Y, metas, W and ids of
        the object produced by the parent ``from_numpy``.
    """
    t = super().from_numpy(*args, **kwargs)
    # t is already a corpus, but its constructor was never run because the
    # parent factory only goes through __new__. Re-construct it here so that
    # attributes such as _titles, _tokens, preprocessors and text_features
    # receive their default values.
    # NOTE(review): this always builds a plain Corpus rather than cls(...),
    # so a subclass calling from_numpy gets a Corpus back - confirm intended.
    return Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)

@classmethod
def from_list(cls, domain, rows, weights=None):
    """Create a corpus from a list of rows with Corpus defaults initialised.

    Args:
        domain: domain describing the columns of ``rows``.
        rows: the data rows, forwarded to the parent ``from_list``.
        weights: optional row weights, forwarded to the parent ``from_list``.

    Returns:
        A new ``Corpus`` built from the domain, X, Y, metas, W and ids of
        the object produced by the parent ``from_list``.
    """
    t = super().from_list(domain, rows, weights)
    # t is already a corpus, but its constructor was never run by the parent
    # factory method. Re-construct it here so that attributes such as
    # _titles, _tokens, preprocessors and text_features receive their
    # default values.
    # NOTE(review): this always builds a plain Corpus rather than cls(...),
    # so a subclass calling from_list gets a Corpus back - confirm intended.
    return Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)

@classmethod
Expand Down
26 changes: 26 additions & 0 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,32 @@ def test_corpus_from_file_with_tab(self):
c2 = Corpus.from_file('book-excerpts.tab')
self.assertEqual(c, c2)

def test_corpus_from_numpy(self):
    """Corpus.from_numpy returns a corpus whose defaults are all set.

    Checks the length, the default document titles, the selected text
    feature, the absence of tokens and the empty preprocessor list.
    """
    meta_vars = [StringVariable("title"), StringVariable("a")]
    two_meta_domain = Domain([], metas=meta_vars)
    no_attributes = np.empty((2, 0))
    meta_values = np.array([["title1", "a"], ["title2", "b"]])

    built = Corpus.from_numpy(
        two_meta_domain, no_attributes, metas=meta_values
    )

    self.assertEqual(2, len(built))
    assert_array_equal(["Document 1", "Document 2"], built.titles)
    self.assertListEqual([StringVariable("title")], built.text_features)
    self.assertIsNone(built._tokens)
    self.assertListEqual([], built.used_preprocessor.preprocessors)

def test_corpus_from_list(self):
    """Corpus.from_list returns a corpus whose defaults are all set.

    Checks the length, the default document titles, the selected text
    feature, the absence of tokens and the empty preprocessor list.
    """
    meta_vars = [StringVariable("title"), StringVariable("a")]
    two_meta_domain = Domain([], metas=meta_vars)
    rows = [["title1", "a"], ["title2", "b"]]

    built = Corpus.from_list(two_meta_domain, rows)

    self.assertEqual(2, len(built))
    assert_array_equal(["Document 1", "Document 2"], built.titles)
    self.assertListEqual([StringVariable("title")], built.text_features)
    self.assertIsNone(built._tokens)
    self.assertListEqual([], built.used_preprocessor.preprocessors)

def test_corpus_from_file_missing(self):
with self.assertRaises(FileNotFoundError):
Corpus.from_file('missing_file')
Expand Down
8 changes: 5 additions & 3 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __init__(self):

# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
ubox = gui.widgetBox(fbox, "Used text features")
self.used_attrs_model = VariableListModel(enable_dnd=True)
self.used_attrs_view = VariablesListItemView()
self.used_attrs_view.setModel(self.used_attrs_model)
Expand All @@ -94,7 +94,7 @@ def __init__(self):
aa.rowsRemoved.connect(self.update_feature_selection)

# Ignored Text Features
ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False)
ibox = gui.widgetBox(fbox, "Ignored text features")
self.unused_attrs_model = VariableListModel(enable_dnd=True)
self.unused_attrs_view = VariablesListItemView()
self.unused_attrs_view.setModel(self.unused_attrs_model)
Expand Down Expand Up @@ -146,6 +146,7 @@ def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
def open_file(self, path=None, data=None):
self.closeContext()
self.Error.clear()
self.cancel()
self.unused_attrs_model[:] = []
self.used_attrs_model[:] = []
self.start(self._load_corpus, path, data)
Expand All @@ -158,7 +159,8 @@ def on_done(self, corpus: Corpus) -> None:
self.update_output_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
all_str_features = [f for f in self.corpus.domain.metas if f.is_string]
if not all_str_features:
self.Error.corpus_without_text_features()
self.Outputs.corpus.send(None)
return
Expand Down
36 changes: 35 additions & 1 deletion orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

class TestOWCorpus(WidgetTest):
def setUp(self):
    """Create a fresh OWCorpus widget for every test."""
    # Create the widget exactly once; the annotation lets type checkers and
    # IDEs resolve OWCorpus attributes on self.widget.
    self.widget: OWCorpus = self.create_widget(OWCorpus)

def check_output(self, sel_title):
"""
Expand Down Expand Up @@ -286,6 +286,40 @@ def test_keep_selected_variables(self):
self.wait_until_finished()
self.assertListEqual(list(prew_selected), self.widget.used_attrs)

def test_no_text_feature(self):
    """
    Test with data whose text_features list is empty. The widget should
    not show the error, but should list all features as unused.
    """
    # On creation the widget already loads book-excerpts from file and
    # stores context settings. Reset the settings to their defaults here;
    # otherwise the context would move the Text variable to the used
    # attributes.
    self.widget.settingsHandler.reset_to_original(self.widget)
    data = Corpus.from_file("book-excerpts")
    data.text_features = []
    self.send_signal(self.widget.Inputs.data, data)
    self.wait_until_finished()
    self.assertFalse(
        self.widget.Error.corpus_without_text_features.is_shown()
    )
    self.assertEqual(0, len(list(self.widget.used_attrs_model)))
    self.assertListEqual(
        [data.domain["Text"]],
        list(self.widget.unused_attrs_model)
    )

def test_corpus_without_text_features(self):
    """
    The corpus_without_text_features error must be shown when the input
    data has no text features (here: the iris table).
    """
    iris = Table("iris")
    self.send_signal(self.widget.Inputs.data, iris)
    self.wait_until_finished()

    no_text_error = self.widget.Error.corpus_without_text_features
    self.assertTrue(no_text_error.is_shown())


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()