Skip to content

Commit

Permalink
Implement Sparse2CorpusSliceable and use it instead of Sparse2Corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Oct 11, 2021
1 parent 3dce948 commit 6617c2c
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 4 deletions.
3 changes: 2 additions & 1 deletion orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,8 @@ def retain_preprocessing(orig, new, key=...):
new.ngram_range = orig.ngram_range
new.attributes = orig.attributes
new.used_preprocessor = orig.used_preprocessor
new.ngrams_corpus = orig._ngrams_corpus
if orig._ngrams_corpus is not None:
new.ngrams_corpus = orig._ngrams_corpus[key]

def __eq__(self, other):
def arrays_equal(a, b):
Expand Down
30 changes: 29 additions & 1 deletion orangecontrib/text/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal
from scipy.sparse import csc_matrix

from orangecontrib.text.util import chunks, np_sp_sum
from orangecontrib.text.util import chunks, np_sp_sum, Sparse2CorpusSliceable


class ChunksTest(unittest.TestCase):
Expand All @@ -28,3 +30,29 @@ def test_np_sp_sum(self):
self.assertEqual(np_sp_sum(data), 10)
np.testing.assert_equal(np_sp_sum(data, axis=1), np.ones(10))
np.testing.assert_equal(np_sp_sum(data, axis=0), np.ones(10))


class TestSparse2CorpusSliceable(unittest.TestCase):
    """Check that indexing Sparse2CorpusSliceable selects matrix columns."""

    def setUp(self) -> None:
        # 2 x 3 matrix; every test indexes along its second axis (columns).
        self.orig_array = np.array([[1, 2, 3], [4, 5, 6]])
        self.s2c = Sparse2CorpusSliceable(csc_matrix(self.orig_array))

    def test_slice(self):
        assert_array_equal(self.s2c[:2].sparse.toarray(), self.orig_array[:, :2])
        assert_array_equal(self.s2c[1:3].sparse.toarray(), self.orig_array[:, 1:3])

    def test_index(self):
        # Selecting one column of a sparse matrix keeps the result 2-D, so
        # the expected dense array must be of shape (2, 1) and not (2,);
        # assert_array_equal fails on mismatched shapes.
        assert_array_equal(self.s2c[1].sparse.toarray(), self.orig_array[:, [1]])

    def test_list_of_indices(self):
        assert_array_equal(
            self.s2c[[1, 2]].sparse.toarray(), self.orig_array[:, [1, 2]]
        )
        assert_array_equal(self.s2c[[1]].sparse.toarray(), self.orig_array[:, [1]])

    def test_elipsis(self):
        assert_array_equal(self.s2c[...].sparse.toarray(), self.orig_array)

    def test_unsupported_key(self):
        # Keys outside the supported index types are rejected explicitly.
        with self.assertRaises(TypeError):
            self.s2c["a"]


if __name__ == "__main__":
unittest.main()
32 changes: 32 additions & 0 deletions orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from functools import wraps
from math import ceil
from typing import Union, List

import numpy as np
import scipy.sparse as sp
from gensim.matutils import Sparse2Corpus
from scipy.sparse import csc_matrix


def chunks(iterable, chunk_size):
""" Splits iterable objects into chunk of fixed size.
Expand Down Expand Up @@ -59,3 +63,31 @@ def np_sp_sum(x, axis=None):
return r
else:
return np.sum(x, axis=axis)


class Sparse2CorpusSliceable(Sparse2Corpus):
    """
    Sparse2Corpus that also supports selecting a subset of its data.

    Sparse2Corpus supports only retrieving a vector for a single document.
    This class implements index and slice operations on the Sparse2Corpus
    object and returns the selection as a new corpus object.

    Todo: this implementation is temporary, remove it when/if implemented in gensim
    """

    def __getitem__(
        self, key: Union[int, List[int], np.ndarray, type(...), slice]
    ) -> Sparse2Corpus:
        """Retrieve a subset of the corpus by index, indices or slice.

        Parameters
        ----------
        key
            Index of a document, list or numpy array of indices, slice, or
            Ellipsis. It is applied to the second axis (columns) of
            ``self.sparse``; all rows are always kept.

        Returns
        -------
        New Sparse2CorpusSliceable wrapping the selected subset of the
        sparse data from self.

        Raises
        ------
        TypeError
            If ``key`` is not one of the supported index types.
        """
        # numpy integer scalars and index arrays are accepted alongside the
        # builtin types, since indices are often produced by numpy operations.
        supported = (int, np.integer, list, np.ndarray, type(...), slice)
        if not isinstance(key, supported):
            raise TypeError(f"Indexing by type {type(key)} not supported.")
        # Keep every row and apply the key to the column axis only.
        sparse = self.sparse[:, key]
        return Sparse2CorpusSliceable(sparse)
5 changes: 3 additions & 2 deletions orangecontrib/text/vectorization/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from itertools import chain

import numpy as np
from gensim import matutils

from Orange.data.util import SharedComputeValue
from Orange.data import Domain
Expand All @@ -13,6 +12,8 @@
# remove following section when orange3=3.27 is available
import re

from orangecontrib.text.util import Sparse2CorpusSliceable

RE_FIND_INDEX = r"(^{})( \((\d{{1,}})\))?$"


Expand Down Expand Up @@ -81,7 +82,7 @@ def add_features(corpus, X, dictionary, compute_values=None, var_attrs=None):
sparse=True,
rename_existing=True
)
corpus.ngrams_corpus = matutils.Sparse2Corpus(X.T)
corpus.ngrams_corpus = Sparse2CorpusSliceable(X.T)
return corpus


Expand Down

0 comments on commit 6617c2c

Please sign in to comment.