Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Corpus - from_table: keep text feature when renamed #585

Merged
merged 1 commit into from
Oct 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,7 @@ def retain_preprocessing(orig, new, key=...):
if isinstance(orig, Corpus):
if isinstance(key, tuple): # get row selection
key = key[0]

if orig._tokens is not None: # retain preprocessing
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
Expand All @@ -606,9 +607,22 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary

if isinstance(new, Corpus):
# _find_identical_feature returns None when the feature is not found;
# filter these Nones out of the list
new.text_features = list(filter(None, [
new._find_identical_feature(tf)
for tf in orig.text_features
]))
else:
new.text_features = [
tf
for tf in orig.text_features
if tf in set(new.domain.metas)
]

new._titles = orig._titles[key]
new_domain_metas = set(new.domain.metas)
new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
new.attributes = orig.attributes
new.used_preprocessor = orig.used_preprocessor
Expand Down
12 changes: 12 additions & 0 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,18 @@ def test_from_table(self):
np.testing.assert_equal(t.metas, c.metas)
self.assertEqual(c.text_features, [t.domain.metas[0]])

def test_from_table_renamed(self):
    """Check that Corpus.from_table keeps a text feature that was renamed."""
    source = Corpus.from_file('book-excerpts')
    renamed_meta = source.domain.metas[0].renamed("text1")
    target_domain = Domain(source.domain.attributes, metas=[renamed_meta])

    # Convert into a domain whose only meta is the renamed text feature.
    converted = Corpus.from_table(target_domain, source)

    # The result is still a Corpus holding the same rows and meta values.
    self.assertIsInstance(converted, Corpus)
    self.assertEqual(len(source), len(converted))
    np.testing.assert_equal(source.metas, converted.metas)

    # Exactly one text feature survives, and it carries the new name.
    self.assertEqual(1, len(converted.text_features))
    self.assertEqual("text1", converted.text_features[0].name)

def test_infer_text_features(self):
c = Corpus.from_file('friends-transcripts')
tf = c.text_features
Expand Down