Skip to content

Commit

Permalink
Corpus & Bow: Improve sparsity handling as of Orange 3.8
Browse files: browse the repository at this point in the history
  • Loading branch information
nikicc committed Dec 4, 2017
1 parent 2de6265 commit 2d2a215
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
13 changes: 6 additions & 7 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
"""
n_doc = _check_arrays(X, Y, metas)

self.X = X if X is not None else sp.csr_matrix((n_doc, 0)) # prefer sparse (BoW compute values)
self.X = X if X is not None else np.zeros((n_doc, 0))
self.Y = Y if Y is not None else np.zeros((n_doc, 0))
self.metas = metas if metas is not None else np.zeros((n_doc, 0))
self.W = W if W is not None else np.zeros((n_doc, 0))
Expand Down Expand Up @@ -155,7 +155,7 @@ def extend_corpus(self, metadata, Y):
self._tokens = None # invalidate tokens

def extend_attributes(self, X, feature_names, feature_values=None,
compute_values=None, var_attrs=None):
compute_values=None, var_attrs=None, sparse=False):
"""
Append features to corpus. If `feature_values` argument is present,
features will be Discrete else Continuous.
Expand All @@ -166,6 +166,7 @@ def extend_attributes(self, X, feature_names, feature_values=None,
feature_values (list): A list of possible values for Discrete features.
compute_values (list): Compute values for corresponding features.
var_attrs (dict): Additional attributes appended to variable.attributes.
sparse (bool): Whether the features should be marked as sparse.
"""
if self.X.size == 0:
self.X = X
Expand All @@ -185,7 +186,8 @@ def extend_attributes(self, X, feature_names, feature_values=None,
var = DiscreteVariable(f, values=values, compute_value=cv)
else:
var = ContinuousVariable(f, compute_value=cv)
if cv is not None: # set original variable for cv
var.sparse = sparse # don't pass this to constructor so this works with Orange < 3.8.0
if cv is not None: # set original variable for cv
cv.variable = var
if isinstance(var_attrs, dict):
var.attributes.update(var_attrs)
Expand Down Expand Up @@ -408,10 +410,7 @@ def from_file(cls, filename):
filename = abs_path

table = Table.from_file(filename)
X = table.X
if not sp.issparse(X) and X.size == 0:
X = sp.csr_matrix(X) # prefer sparse (BoW compute values)
return cls(table.domain, X, table.Y, table.metas, table.W)
return cls(table.domain, table.X, table.Y, table.metas, table.W)

@staticmethod
def retain_preprocessing(orig, new, key=...):
Expand Down
3 changes: 2 additions & 1 deletion orangecontrib/text/vectorization/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def add_features(corpus, X, dictionary, compute_values=None, var_attrs=None):
corpus.extend_attributes(X[:, order],
feature_names=(dictionary[i] for i in order),
var_attrs=variable_attrs,
compute_values=compute_values)
compute_values=compute_values,
sparse=True)
corpus.ngrams_corpus = matutils.Sparse2Corpus(X.T)


Expand Down

0 comments on commit 2d2a215

Please sign in to comment.