Skip to content

Commit

Permalink
Corpus: Fix error with renamed text feature
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 30, 2020
1 parent 2649629 commit fa6f03f
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 7 deletions.
54 changes: 47 additions & 7 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,16 @@
import scipy.sparse as sp
from gensim import corpora

from Orange.data import ContinuousVariable, DiscreteVariable, \
Domain, RowInstance, Table, StringVariable
from Orange.data import (
Variable,
ContinuousVariable,
DiscreteVariable,
Domain,
RowInstance,
Table,
StringVariable,
)
from Orange.preprocess.transformation import Identity
# uncomment when Orange3==3.27 is available
# from Orange.data.util import get_unique_names
# remove when Orange3==3.27 is available
Expand Down Expand Up @@ -102,17 +110,49 @@ def used_preprocessor(self, pp):
else:
raise NotImplementedError

def set_text_features(self, feats):
def _find_identical_feature(self, feature: Variable) -> Optional[Variable]:
    """
    Look up the domain variable that corresponds to ``feature``.

    A domain variable corresponds to ``feature`` when it is equal to it,
    or when its ``compute_value`` is an ``Identity`` transformation whose
    source variable is ``feature`` (e.g. the feature was renamed).

    Parameters
    ----------
    feature
        The variable to search for among the domain's variables and metas.

    Returns
    -------
    The first matching variable from the domain (variables are searched
    before metas), or None when no variable matches.
    """

    def corresponds(candidate):
        # Exact match takes priority; compute_value is not inspected then.
        if candidate == feature:
            return True
        transform = candidate.compute_value
        return isinstance(transform, Identity) and transform.variable == feature

    candidates = chain(self.domain.variables, self.domain.metas)
    return next((cand for cand in candidates if corresponds(cand)), None)

def set_text_features(self, feats: Optional[List[Variable]]) -> None:
"""
Select which meta-attributes to include when mining text.
Args:
feats (list or None): List of text features to include. If None infer them.
Parameters
----------
feats
List of text features to include. If None infer them.
"""
if feats is not None:
for f in feats:
feats = copy(feats) # copy to not edit passed array inplace
for i, f in enumerate(feats):
if f not in chain(self.domain.variables, self.domain.metas):
raise ValueError('Feature "{}" not found.'.format(f))
# if not exact feature in the domain, it may be renamed
# find identity - renamed feature
id_feat = self._find_identical_feature(f)
if id_feat is not None:
feats[i] = id_feat
else:
raise ValueError('Feature "{}" not found.'.format(f))
if len(set(feats)) != len(feats):
raise ValueError('Text features must be unique.')
self.text_features = feats
Expand Down
11 changes: 11 additions & 0 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ def test_extend_attribute_rename_existing(self):
)
self.assertEqual(0, len(c.domain.attributes))

def test_extend_attribute_rename_text_features(self):
    """
    Extending attributes with ``rename_existing=True`` must work when a
    new attribute is named 'Text'.

    NOTE(review): presumably 'Text' clashes with the name of the corpus'
    existing text feature, which then gets renamed - confirm against the
    'book-excerpts' dataset.
    """
    # the loaded corpus has no features, so X consists of the new columns
    corpus = Corpus.from_file('book-excerpts')
    n_docs = len(corpus)
    new_columns = np.random.random((n_docs, 2))
    extended = corpus.extend_attributes(
        new_columns, ['Text', '2'], rename_existing=True
    )
    self.assertEqual(extended.X.shape, (n_docs, 2))

def test_corpus_not_eq(self):
c = Corpus.from_file('book-excerpts')
n_doc = c.X.shape[0]
Expand Down

0 comments on commit fa6f03f

Please sign in to comment.