
[ENH] Refactor preprocessors #506

Merged
3 commits merged on May 28, 2020
54 changes: 43 additions & 11 deletions orangecontrib/text/corpus.py
@@ -12,6 +12,7 @@

from Orange.data import ContinuousVariable, DiscreteVariable, \
Domain, RowInstance, Table, StringVariable

from orangecontrib.text.vectorization import BowVectorizer


@@ -67,8 +68,10 @@ def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
self.ngram_range = (1, 1)
self.attributes = {}
self.pos_tags = None
self.used_preprocessor = None # required for compute values
from orangecontrib.text.preprocess import PreprocessorList
self.__used_preprocessor = PreprocessorList([]) # required for compute values
self._titles: Optional[np.ndarray] = None
self._pp_documents = None # preprocessed documents

if domain is not None and text_features is None:
self._infer_text_features()
@@ -81,6 +84,21 @@
Table._init_ids(self)
self._set_unique_titles()

@property
def used_preprocessor(self):
return self.__used_preprocessor # type: PreprocessorList

@used_preprocessor.setter
def used_preprocessor(self, pp):
from orangecontrib.text.preprocess import PreprocessorList, Preprocessor

if isinstance(pp, PreprocessorList):
self.__used_preprocessor = PreprocessorList(list(pp.preprocessors))
elif isinstance(pp, Preprocessor):
self.__used_preprocessor.preprocessors.append(pp)
else:
raise NotImplementedError
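
A minimal sketch of the setter's two modes, assuming ``LowercaseTransformer`` and ``WordPunctTokenizer`` are ``Preprocessor`` subclasses as the package docs below suggest: assigning a ``PreprocessorList`` replaces the stored history with a copy, while assigning a single ``Preprocessor`` appends to it.

>>> from orangecontrib.text import Corpus, preprocess
>>> from orangecontrib.text.preprocess import PreprocessorList
>>> corpus = Corpus.from_file('book-excerpts')
>>> # a single Preprocessor is appended to the stored history
>>> corpus.used_preprocessor = preprocess.LowercaseTransformer()
>>> # a PreprocessorList replaces the history with a copy of its items
>>> corpus.used_preprocessor = PreprocessorList([preprocess.WordPunctTokenizer()])
>>> len(corpus.used_preprocessor.preprocessors)
1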

def set_text_features(self, feats):
"""
Select which meta-attributes to include when mining text.
@@ -255,12 +273,19 @@ def extend_attributes(self, X, feature_names, feature_values=None,

@property
def documents(self):
"""
Returns: a list of strings representing documents — created by joining
selected text features.
"""
""" Returns a list of strings representing documents — created
by joining selected text features. """
return self.documents_from_features(self.text_features)

@property
def pp_documents(self):
""" Preprocessed documents (transformed). """
return self._pp_documents or self.documents

@pp_documents.setter
def pp_documents(self, documents):
self._pp_documents = documents
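
A small sketch of the fallback, assuming a freshly loaded corpus whose raw text contains uppercase characters: until something sets ``pp_documents``, the property simply mirrors ``documents``.

>>> corpus = Corpus.from_file('book-excerpts')
>>> corpus.pp_documents == corpus.documents  # nothing set yet, falls back
True
>>> corpus.pp_documents = [doc.lower() for doc in corpus.documents]
>>> corpus.pp_documents == corpus.documents
False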

@property
def titles(self):
""" Returns a list of titles. """
@@ -298,27 +323,33 @@ def store_tokens(self, tokens, dictionary=None):
def tokens(self):
"""
np.ndarray: A list of lists containing tokens. If tokens are not yet
present, run default preprocessor and save tokens.
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
self._apply_base_preprocessor()
return self._base_tokens()[0]
return self._tokens

def has_tokens(self):
""" Return whether corpus is preprocessed or not. """
return self._tokens is not None

def _apply_base_preprocessor(self):
from orangecontrib.text.preprocess import base_preprocessor
base_preprocessor(self)
def _base_tokens(self):
from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
BASE_TOKENIZER, PreprocessorList

# don't use anything that requires NLTK data, so its asynchronous download is never blocked
base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
BASE_TOKENIZER])
corpus = base_preprocessors(self)
return corpus.tokens, corpus.dictionary
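
A sketch of the lazy behaviour, assuming preprocessors return a new corpus rather than mutating their input: an unpreprocessed corpus now computes base tokens on the fly instead of storing them, so ``has_tokens()`` stays False until a preprocessor is applied explicitly. The ``dictionary`` property below follows the same pattern via ``_base_tokens()[1]``.

>>> corpus = Corpus.from_file('book-excerpts')
>>> corpus.has_tokens()
False
>>> tokens = corpus.tokens  # computed via _base_tokens(), not cached
>>> corpus.has_tokens()
False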

@property
def dictionary(self):
"""
corpora.Dictionary: A token to id mapper.
"""
if self._dictionary is None:
self._apply_base_preprocessor()
return self._base_tokens()[1]
return self._dictionary

def ngrams_iterator(self, join_with=' ', include_postags=False):
@@ -369,6 +400,7 @@ def copy(self):
c.name = self.name
c.used_preprocessor = self.used_preprocessor
c._titles = self._titles
c._pp_documents = self._pp_documents
return c

@staticmethod
33 changes: 22 additions & 11 deletions orangecontrib/text/preprocess/__init__.py
@@ -5,19 +5,30 @@
>>> from orangecontrib.text import Corpus
>>> corpus = Corpus.from_file('book-excerpts')

And create a :class:`Preprocessor` object with the methods you want:
And create an instance of an arbitrary preprocessor:

>>> from orangecontrib.text import preprocess
>>> p = preprocess.Preprocessor(transformers=[preprocess.LowercaseTransformer()],
... tokenizer=preprocess.WordPunctTokenizer(),
... normalizer=preprocess.SnowballStemmer('english'),
... filters=[preprocess.StopwordsFilter('english'),
... preprocess.FrequencyFilter(min_df=.1)])
>>> p = preprocess.LowercaseTransformer()
>>> corpus = p(corpus)
>>> corpus.tokens[0][:10]
['the', 'house', 'jim', 'says', 'he', 'rum', ';', 'and', 'as', 'he']

Then you can apply your preprocessor to the corpus and access tokens via the ``tokens`` attribute:

>>> new_corpus = p(corpus)
>>> new_corpus.tokens[0][:10]
You can also create a :class:`PreprocessorList` object with the preprocessors you want:

>>> from orangecontrib.text.preprocess import PreprocessorList
>>> pp_list = [preprocess.LowercaseTransformer(),
... preprocess.WordPunctTokenizer(),
... preprocess.SnowballStemmer(),
... preprocess.StopwordsFilter(),
... preprocess.FrequencyFilter(min_df=.1)]
>>> p = PreprocessorList(pp_list)

Then you can apply your preprocessors to the corpus and access tokens via the ``tokens`` attribute:

>>> corpus = Corpus.from_file('book-excerpts')
>>> corpus = p(corpus)
>>> corpus.tokens[0][:10]
['hous', 'say', ';', 'spoke', 'littl', 'one', 'hand', 'wall', 'hurt', '?']


@@ -30,8 +41,8 @@
['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']

"""
from .preprocess import *
from .tokenize import *
from .filter import *
from .normalize import *
from .tokenize import *
from .transform import *
from .preprocess import *