diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 83cbf58c2..8c0e5efcc 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -591,6 +591,7 @@ def retain_preprocessing(orig, new, key=...):
         if isinstance(orig, Corpus):
             if isinstance(key, tuple):  # get row selection
                 key = key[0]
+
             if orig._tokens is not None:  # retain preprocessing
                 if isinstance(key, Integral):
                     new._tokens = np.array([orig._tokens[key]])
@@ -606,9 +607,26 @@ def retain_preprocessing(orig, new, key=...):
                 else:
                     raise TypeError('Indexing by type {} not supported.'.format(type(key)))
                 new._dictionary = orig._dictionary
+
+            if isinstance(new, Corpus):
+                # _find_identical_feature returns None when the feature is
+                # not found; filter those Nones out of the list
+                new.text_features = list(filter(None, [
+                    new._find_identical_feature(tf)
+                    for tf in orig.text_features
+                ]))
+            else:
+                # new is not a Corpus: keep only the text features that are
+                # still among its metas; build the lookup set once instead
+                # of once per text feature
+                new_domain_metas = set(new.domain.metas)
+                new.text_features = [
+                    tf
+                    for tf in orig.text_features
+                    if tf in new_domain_metas
+                ]
+
             new._titles = orig._titles[key]
-            new_domain_metas = set(new.domain.metas)
-            new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
             new.ngram_range = orig.ngram_range
             new.attributes = orig.attributes
             new.used_preprocessor = orig.used_preprocessor
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index f2a5db090..44cc432d6 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -180,6 +180,18 @@ def test_from_table(self):
         np.testing.assert_equal(t.metas, c.metas)
         self.assertEqual(c.text_features, [t.domain.metas[0]])
 
+    def test_from_table_renamed(self):
+        c1 = Corpus.from_file('book-excerpts')
+        new_domain = Domain(c1.domain.attributes, metas=[c1.domain.metas[0].renamed("text1")])
+
+        # the text feature must be retained even though its meta was renamed
+        c2 = Corpus.from_table(new_domain, c1)
+        self.assertIsInstance(c2, Corpus)
+        self.assertEqual(len(c1), len(c2))
+        np.testing.assert_equal(c1.metas, c2.metas)
+        self.assertEqual(1, len(c2.text_features))
+        self.assertEqual("text1", c2.text_features[0].name)
+
     def test_infer_text_features(self):
         c = Corpus.from_file('friends-transcripts')
         tf = c.text_features